You are viewing a plain text version of this content. The canonical link for it is here.
Posted to droids-commits@incubator.apache.org by to...@apache.org on 2013/02/12 16:40:00 UTC
svn commit: r1445252 - in
/incubator/droids/branches/0.2.x-cleanup/droids-tika: ./
src/main/java/org/apache/droids/tika/
src/main/java/org/apache/droids/tika/api/
src/main/java/org/apache/droids/tika/parse/
src/test/java/org/apache/droids/tika/ src/tes...
Author: tobr
Date: Tue Feb 12 16:39:59 2013
New Revision: 1445252
URL: http://svn.apache.org/r1445252
Log:
updated tika module
Added:
incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/TikaParser.java
- copied, changed from r1441429, incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/TikaDocumentParser.java
incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/TikaParserData.java (with props)
incubator/droids/branches/0.2.x-cleanup/droids-tika/src/test/java/org/apache/droids/tika/TikaParserTest.java
- copied, changed from r1444858, incubator/droids/branches/0.2.x-cleanup/droids-tika/src/test/java/org/apache/droids/tika/TikaHtmlParserTest.java
incubator/droids/branches/0.2.x-cleanup/droids-tika/src/test/java/org/apache/droids/tika/TikaTask.java (with props)
incubator/droids/branches/0.2.x-cleanup/droids-tika/src/test/resources/droids-test.odt (with props)
incubator/droids/branches/0.2.x-cleanup/droids-tika/src/test/resources/logback.xml (with props)
Removed:
incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/TikaDocumentParser.java
incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java
incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/api/
incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/parse/
incubator/droids/branches/0.2.x-cleanup/droids-tika/src/test/java/org/apache/droids/tika/TikaHtmlParserTest.java
Modified:
incubator/droids/branches/0.2.x-cleanup/droids-tika/pom.xml
Modified: incubator/droids/branches/0.2.x-cleanup/droids-tika/pom.xml
URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-tika/pom.xml?rev=1445252&r1=1445251&r2=1445252&view=diff
==============================================================================
--- incubator/droids/branches/0.2.x-cleanup/droids-tika/pom.xml (original)
+++ incubator/droids/branches/0.2.x-cleanup/droids-tika/pom.xml Tue Feb 12 16:39:59 2013
@@ -26,13 +26,13 @@
<relativePath>../pom.xml</relativePath>
</parent>
<artifactId>droids-tika</artifactId>
- <name>Apache Droids Tika</name>
+ <name>APACHE DROIDS TIKA PARSER</name>
<inceptionYear>2007</inceptionYear>
- <description>Apache Droids Tika Parser</description>
+ <description>Apache Droids Parser using Apache Tika</description>
<packaging>jar</packaging>
<properties>
- <tika-release-version>1.1</tika-release-version>
+ <tika.version>1.3</tika.version>
</properties>
<dependencies>
@@ -44,12 +44,12 @@
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-core</artifactId>
- <version>${tika-release-version}</version>
+ <version>${tika.version}</version>
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parsers</artifactId>
- <version>${tika-release-version}</version>
+ <version>${tika.version}</version>
<exclusions>
<exclusion>
<artifactId>commons-logging</artifactId>
@@ -64,5 +64,11 @@
<version>${junit.version}</version>
<scope>test</scope>
</dependency>
+ <dependency>
+ <groupId>ch.qos.logback</groupId>
+ <artifactId>logback-classic</artifactId>
+ <version>${logback.version}</version>
+ <scope>test</scope>
+ </dependency>
</dependencies>
</project>
Copied: incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/TikaParser.java (from r1441429, incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/TikaDocumentParser.java)
URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/TikaParser.java?p2=incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/TikaParser.java&p1=incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/TikaDocumentParser.java&r1=1441429&r2=1445252&rev=1445252&view=diff
==============================================================================
--- incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/TikaDocumentParser.java (original)
+++ incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/TikaParser.java Tue Feb 12 16:39:59 2013
@@ -1,21 +1,21 @@
/*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements. See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership. The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License. You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied. See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*/
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
package org.apache.droids.tika;
import java.io.IOException;
@@ -23,7 +23,8 @@ import java.io.InputStream;
import java.io.StringWriter;
import java.net.URI;
import java.net.URISyntaxException;
-import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.Set;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.TransformerConfigurationException;
@@ -31,20 +32,17 @@ import javax.xml.transform.sax.SAXTransf
import javax.xml.transform.sax.TransformerHandler;
import javax.xml.transform.stream.StreamResult;
-import org.apache.droids.LinkTask;
-import org.apache.droids.api.ContentEntity;
-import org.apache.droids.api.Link;
-import org.apache.droids.api.Task;
-import org.apache.droids.exception.DroidsException;
-import org.apache.droids.tika.api.TikaParse;
-import org.apache.droids.tika.api.TikaParser;
-import org.apache.droids.tika.parse.TikaParseImpl;
+import org.apache.droids.core.DroidsException;
+import org.apache.droids.core.LinkedTask;
+import org.apache.droids.core.Task;
+import org.apache.droids.core.Parser;
import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.*;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.html.BoilerpipeContentHandler;
import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.Link;
import org.apache.tika.sax.LinkContentHandler;
import org.apache.tika.sax.TeeContentHandler;
import org.slf4j.Logger;
@@ -56,22 +54,16 @@ import org.xml.sax.SAXException;
* Any document type that Tika can handle, can be handled by this class,
* including HTML.
*/
-public class TikaDocumentParser implements TikaParser {
+public class TikaParser<T extends Task> implements Parser<T> {
- protected static final Logger LOG = LoggerFactory.getLogger(TikaDocumentParser.class);
+ protected static final Logger logger = LoggerFactory.getLogger(TikaParser.class);
@Override
- public TikaParse parse(ContentEntity entity, Task task) throws DroidsException,
- IOException {
+ public void parse(T task) throws DroidsException, IOException {
// Init Tika objects
org.apache.tika.parser.Parser parser = new AutoDetectParser();
Metadata metadata = new Metadata();
- String charset = entity.getCharset();
- if (charset == null) {
- charset = "UTF-8";
- }
-
StringWriter dataBuffer = new StringWriter();
StringWriter bodyBuffer = new StringWriter();
StringWriter mainContentBuffer = new StringWriter();
@@ -92,34 +84,43 @@ public class TikaDocumentParser implemen
TeeContentHandler parallelHandler = new TeeContentHandler(xmlHandler, mainContentHandler, bodyHandler, linkHandler);
- InputStream instream = entity.obtainContent();
+ InputStream instream = task.getContentEntity().getContent();
try {
parser.parse(instream, parallelHandler, metadata, new ParseContext());
- ArrayList<Link> extractedTasks = new ArrayList<Link>();
- int depth = task.getDepth() + 1;
- if (task instanceof LinkTask) {
- for (org.apache.tika.sax.Link tikaLink : linkHandler.getLinks()) {
+ if (task instanceof LinkedTask) {
+ Set<LinkedTask> extractedTasks = new HashSet<LinkedTask>();
+ for (Link tikaLink : linkHandler.getLinks()) {
try {
URI uri = new URI(tikaLink.getUri());
// Test to see if the scheme is empty
// This would indicate a relative URL, so resolve it against the task URI
if (uri.getScheme() == null) {
- uri = ((Link) task).getURI().resolve(uri);
+ uri = task.getURI().resolve(uri);
}
- extractedTasks.add(new LinkTask((Link) task, uri, depth, tikaLink.getText()));
+ LinkedTask newTask = task.createTask(uri);
+ newTask.getParserData().set(TikaParserData.ANCHOR_TEXT, tikaLink.getText());
+ newTask.getParserData().set(TikaParserData.ANCHOR_TITLE, tikaLink.getTitle());
+ extractedTasks.add(newTask);
} catch (URISyntaxException e) {
- if (LOG.isWarnEnabled()) {
- LOG.warn("URI not valid: " + tikaLink.getUri());
+ if (logger.isWarnEnabled()) {
+ logger.warn("URI not valid: " + tikaLink.getUri());
}
}
}
+ ((LinkedTask) task).setTo(extractedTasks);
+ }
+
+ task.getParserData().set(TikaParserData.XML_CONTENT, dataBuffer.toString());
+ task.getParserData().set(TikaParserData.TEXT, bodyBuffer.toString());
+ task.getParserData().set(TikaParserData.MAIN_CONTENT, mainContentBuffer.toString());
+ for (String key : metadata.names()) {
+ task.getParserData().set(key, metadata.getValues(key));
}
- return new TikaParseImpl(dataBuffer.toString(), extractedTasks, bodyBuffer.toString(), mainContentBuffer.toString(), metadata);
} catch (SAXException ex) {
- throw new DroidsException("Failure parsing document " + task.getId(), ex);
+ throw new DroidsException("Failure parsing document " + task.getURI(), ex);
} catch (TikaException ex) {
- throw new DroidsException("Failure parsing document " + task.getId(), ex);
+ throw new DroidsException("Failure parsing document " + task.getURI(), ex);
} finally {
instream.close();
}
Added: incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/TikaParserData.java
URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/TikaParserData.java?rev=1445252&view=auto
==============================================================================
--- incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/TikaParserData.java (added)
+++ incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/TikaParserData.java Tue Feb 12 16:39:59 2013
@@ -0,0 +1,37 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.droids.tika;
+
+import org.apache.droids.parse.LinkedParserData;
+import org.apache.tika.metadata.*;
+
+import java.io.Serializable;
+
+/**
+ * Container for data extracted by tika using the names
+ * of the tika interfaces to store the data in the ParserData.
+ */
+public class TikaParserData extends LinkedParserData implements CreativeCommons,
+ Geographic, HttpHeaders, Message, MSOffice, ClimateForcast, TIFF, TikaMetadataKeys,
+ TikaCoreProperties, TikaMimeKeys, Serializable {
+
+ public final static String XML_CONTENT = "xmlcontent";
+ public final static String MAIN_CONTENT = "maincontent";
+
+}
Propchange: incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/TikaParserData.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/TikaParserData.java
------------------------------------------------------------------------------
svn:keywords = Author Date Id Revision
Propchange: incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/TikaParserData.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Copied: incubator/droids/branches/0.2.x-cleanup/droids-tika/src/test/java/org/apache/droids/tika/TikaParserTest.java (from r1444858, incubator/droids/branches/0.2.x-cleanup/droids-tika/src/test/java/org/apache/droids/tika/TikaHtmlParserTest.java)
URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-tika/src/test/java/org/apache/droids/tika/TikaParserTest.java?p2=incubator/droids/branches/0.2.x-cleanup/droids-tika/src/test/java/org/apache/droids/tika/TikaParserTest.java&p1=incubator/droids/branches/0.2.x-cleanup/droids-tika/src/test/java/org/apache/droids/tika/TikaHtmlParserTest.java&r1=1444858&r2=1445252&rev=1445252&view=diff
==============================================================================
--- incubator/droids/branches/0.2.x-cleanup/droids-tika/src/test/java/org/apache/droids/tika/TikaHtmlParserTest.java (original)
+++ incubator/droids/branches/0.2.x-cleanup/droids-tika/src/test/java/org/apache/droids/tika/TikaParserTest.java Tue Feb 12 16:39:59 2013
@@ -16,12 +16,32 @@
*/
package org.apache.droids.tika;
-import junit.framework.TestCase;
+import junit.framework.Assert;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
-public class TikaHtmlParserTest extends TestCase {
+import java.io.InputStream;
- public void testSomething() throws Exception {
- // TODO -- test stuff!
- assertTrue(true);
+public class TikaParserTest {
+ private final static Logger logger = LoggerFactory.getLogger(TikaParserTest.class);
+
+
+ @Test
+ public void testParse() throws Exception {
+ TikaTask task = new TikaTask(null, 0);
+ InputStream content = ClassLoader.getSystemResourceAsStream("droids-test.odt");
+ task.getContentEntity().setContent(content);
+
+ TikaParser<TikaTask> parser = new TikaParser<TikaTask>();
+ parser.parse(task);
+
+ TikaParserData data = task.getParserData();
+/*
+ for (String name : data.names()) {
+ logger.info(name + ": " + data.get(name));
+ }
+*/
+ Assert.assertEquals("Droids Tika Test", data.get(TikaParserData.TITLE.getName()));
}
}
\ No newline at end of file
Added: incubator/droids/branches/0.2.x-cleanup/droids-tika/src/test/java/org/apache/droids/tika/TikaTask.java
URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-tika/src/test/java/org/apache/droids/tika/TikaTask.java?rev=1445252&view=auto
==============================================================================
--- incubator/droids/branches/0.2.x-cleanup/droids-tika/src/test/java/org/apache/droids/tika/TikaTask.java (added)
+++ incubator/droids/branches/0.2.x-cleanup/droids-tika/src/test/java/org/apache/droids/tika/TikaTask.java Tue Feb 12 16:39:59 2013
@@ -0,0 +1,76 @@
+package org.apache.droids.tika;
+
+import org.apache.droids.core.ContentEntity;
+import org.apache.droids.core.LinkedTask;
+
+import java.net.URI;
+import java.util.Date;
+
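+/**
+ * Minimal {@link LinkedTask} used by the Tika parser tests: it holds the URI,
+ * depth, content entity and {@link TikaParserData} handed to the parser.
+ * Tasks created via {@link #createTask(URI)} sit one level deeper.
+ */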
+public class TikaTask extends LinkedTask {
+ private ContentEntity contentEntity;
+ private TikaParserData parserData;
+ private URI uri;
+ private int depth;
+ private boolean aborted;
+
+
+ public TikaTask(URI uri, int depth) {
+ this.uri = uri;
+ this.depth = depth;
+ this.aborted = false;
+ this.contentEntity = new ContentEntity();
+ this.parserData = new TikaParserData();
+
+ }
+
+ @Override
+ public LinkedTask getFrom() {
+ return null;
+ }
+
+ @Override
+ public URI getURI() {
+ return uri;
+ }
+
+ @Override
+ public ContentEntity getContentEntity() {
+ return contentEntity;
+ }
+
+ @Override
+ public TikaParserData getParserData() {
+ return parserData;
+ }
+
+ @Override
+ public int getDepth() {
+ return depth;
+ }
+
+ @Override
+ public Date getTaskDate() {
+ return null;
+ }
+
+ @Override
+ public void abort() {
+ aborted = true;
+ }
+
+ @Override
+ public boolean isAborted() {
+ return aborted;
+ }
+
+ @SuppressWarnings("unchecked")
+ @Override
+ public TikaTask createTask(URI uri) {
+ return new TikaTask(uri, depth + 1);
+ }
+}
Propchange: incubator/droids/branches/0.2.x-cleanup/droids-tika/src/test/java/org/apache/droids/tika/TikaTask.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: incubator/droids/branches/0.2.x-cleanup/droids-tika/src/test/java/org/apache/droids/tika/TikaTask.java
------------------------------------------------------------------------------
svn:keywords = Author Date Id Revision
Propchange: incubator/droids/branches/0.2.x-cleanup/droids-tika/src/test/java/org/apache/droids/tika/TikaTask.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/droids/branches/0.2.x-cleanup/droids-tika/src/test/resources/droids-test.odt
URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-tika/src/test/resources/droids-test.odt?rev=1445252&view=auto
==============================================================================
Binary file - no diff available.
Propchange: incubator/droids/branches/0.2.x-cleanup/droids-tika/src/test/resources/droids-test.odt
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: incubator/droids/branches/0.2.x-cleanup/droids-tika/src/test/resources/logback.xml
URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-tika/src/test/resources/logback.xml?rev=1445252&view=auto
==============================================================================
--- incubator/droids/branches/0.2.x-cleanup/droids-tika/src/test/resources/logback.xml (added)
+++ incubator/droids/branches/0.2.x-cleanup/droids-tika/src/test/resources/logback.xml Tue Feb 12 16:39:59 2013
@@ -0,0 +1,17 @@
+<configuration>
+
+ <appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
+ <!-- encoders are assigned the type
+ ch.qos.logback.classic.encoder.PatternLayoutEncoder by default -->
+ <encoder>
+ <pattern>%d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
+ </encoder>
+ </appender>
+
+ <logger name="org.apache.droids" level="info"/>
+
+
+ <root level="info">
+ <appender-ref ref="STDOUT"/>
+ </root>
+</configuration>
\ No newline at end of file
Propchange: incubator/droids/branches/0.2.x-cleanup/droids-tika/src/test/resources/logback.xml
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: incubator/droids/branches/0.2.x-cleanup/droids-tika/src/test/resources/logback.xml
------------------------------------------------------------------------------
svn:keywords = Author Date Id Revision
Propchange: incubator/droids/branches/0.2.x-cleanup/droids-tika/src/test/resources/logback.xml
------------------------------------------------------------------------------
svn:mime-type = text/xml