You are viewing a plain text version of this content. The canonical link for it is here.
Posted to droids-commits@incubator.apache.org by to...@apache.org on 2013/02/12 16:40:00 UTC

svn commit: r1445252 - in /incubator/droids/branches/0.2.x-cleanup/droids-tika: ./ src/main/java/org/apache/droids/tika/ src/main/java/org/apache/droids/tika/api/ src/main/java/org/apache/droids/tika/parse/ src/test/java/org/apache/droids/tika/ src/tes...

Author: tobr
Date: Tue Feb 12 16:39:59 2013
New Revision: 1445252

URL: http://svn.apache.org/r1445252
Log:
updated tika module

Added:
    incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/TikaParser.java
      - copied, changed from r1441429, incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/TikaDocumentParser.java
    incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/TikaParserData.java   (with props)
    incubator/droids/branches/0.2.x-cleanup/droids-tika/src/test/java/org/apache/droids/tika/TikaParserTest.java
      - copied, changed from r1444858, incubator/droids/branches/0.2.x-cleanup/droids-tika/src/test/java/org/apache/droids/tika/TikaHtmlParserTest.java
    incubator/droids/branches/0.2.x-cleanup/droids-tika/src/test/java/org/apache/droids/tika/TikaTask.java   (with props)
    incubator/droids/branches/0.2.x-cleanup/droids-tika/src/test/resources/droids-test.odt   (with props)
    incubator/droids/branches/0.2.x-cleanup/droids-tika/src/test/resources/logback.xml   (with props)
Removed:
    incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/TikaDocumentParser.java
    incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java
    incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/api/
    incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/parse/
    incubator/droids/branches/0.2.x-cleanup/droids-tika/src/test/java/org/apache/droids/tika/TikaHtmlParserTest.java
Modified:
    incubator/droids/branches/0.2.x-cleanup/droids-tika/pom.xml

Modified: incubator/droids/branches/0.2.x-cleanup/droids-tika/pom.xml
URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-tika/pom.xml?rev=1445252&r1=1445251&r2=1445252&view=diff
==============================================================================
--- incubator/droids/branches/0.2.x-cleanup/droids-tika/pom.xml (original)
+++ incubator/droids/branches/0.2.x-cleanup/droids-tika/pom.xml Tue Feb 12 16:39:59 2013
@@ -26,13 +26,13 @@
         <relativePath>../pom.xml</relativePath>
     </parent>
     <artifactId>droids-tika</artifactId>
-    <name>Apache Droids Tika</name>
+    <name>APACHE DROIDS TIKA PARSER</name>
     <inceptionYear>2007</inceptionYear>
-    <description>Apache Droids Tika Parser</description>
+    <description>Apache Droids Parser using Apache Tika</description>
     <packaging>jar</packaging>
 
     <properties>
-        <tika-release-version>1.1</tika-release-version>
+        <tika.version>1.3</tika.version>
     </properties>
 
     <dependencies>
@@ -44,12 +44,12 @@
         <dependency>
             <groupId>org.apache.tika</groupId>
             <artifactId>tika-core</artifactId>
-            <version>${tika-release-version}</version>
+            <version>${tika.version}</version>
         </dependency>
         <dependency>
             <groupId>org.apache.tika</groupId>
             <artifactId>tika-parsers</artifactId>
-            <version>${tika-release-version}</version>
+            <version>${tika.version}</version>
             <exclusions>
                 <exclusion>
                     <artifactId>commons-logging</artifactId>
@@ -64,5 +64,11 @@
             <version>${junit.version}</version>
             <scope>test</scope>
         </dependency>
+        <dependency>
+            <groupId>ch.qos.logback</groupId>
+            <artifactId>logback-classic</artifactId>
+            <version>${logback.version}</version>
+            <scope>test</scope>
+        </dependency>
     </dependencies>
 </project>

Copied: incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/TikaParser.java (from r1441429, incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/TikaDocumentParser.java)
URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/TikaParser.java?p2=incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/TikaParser.java&p1=incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/TikaDocumentParser.java&r1=1441429&r2=1445252&rev=1445252&view=diff
==============================================================================
--- incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/TikaDocumentParser.java (original)
+++ incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/TikaParser.java Tue Feb 12 16:39:59 2013
@@ -1,21 +1,21 @@
 /*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*    http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*/
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
 package org.apache.droids.tika;
 
 import java.io.IOException;
@@ -23,7 +23,8 @@ import java.io.InputStream;
 import java.io.StringWriter;
 import java.net.URI;
 import java.net.URISyntaxException;
-import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.Set;
 
 import javax.xml.transform.OutputKeys;
 import javax.xml.transform.TransformerConfigurationException;
@@ -31,20 +32,17 @@ import javax.xml.transform.sax.SAXTransf
 import javax.xml.transform.sax.TransformerHandler;
 import javax.xml.transform.stream.StreamResult;
 
-import org.apache.droids.LinkTask;
-import org.apache.droids.api.ContentEntity;
-import org.apache.droids.api.Link;
-import org.apache.droids.api.Task;
-import org.apache.droids.exception.DroidsException;
-import org.apache.droids.tika.api.TikaParse;
-import org.apache.droids.tika.api.TikaParser;
-import org.apache.droids.tika.parse.TikaParseImpl;
+import org.apache.droids.core.DroidsException;
+import org.apache.droids.core.LinkedTask;
+import org.apache.droids.core.Task;
+import org.apache.droids.core.Parser;
 import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.*;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.html.BoilerpipeContentHandler;
 import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.Link;
 import org.apache.tika.sax.LinkContentHandler;
 import org.apache.tika.sax.TeeContentHandler;
 import org.slf4j.Logger;
@@ -56,22 +54,16 @@ import org.xml.sax.SAXException;
  * Any document type that Tika can handle, can be handled by this class,
  * including HTML.
  */
-public class TikaDocumentParser implements TikaParser {
+public class TikaParser<T extends Task> implements Parser<T> {
 
-    protected static final Logger LOG = LoggerFactory.getLogger(TikaDocumentParser.class);
+    protected static final Logger logger = LoggerFactory.getLogger(TikaParser.class);
 
     @Override
-    public TikaParse parse(ContentEntity entity, Task task) throws DroidsException,
-            IOException {
+    public void parse(T task) throws DroidsException, IOException {
         // Init Tika objects
         org.apache.tika.parser.Parser parser = new AutoDetectParser();
         Metadata metadata = new Metadata();
 
-        String charset = entity.getCharset();
-        if (charset == null) {
-            charset = "UTF-8";
-        }
-
         StringWriter dataBuffer = new StringWriter();
         StringWriter bodyBuffer = new StringWriter();
         StringWriter mainContentBuffer = new StringWriter();
@@ -92,34 +84,43 @@ public class TikaDocumentParser implemen
 
         TeeContentHandler parallelHandler = new TeeContentHandler(xmlHandler, mainContentHandler, bodyHandler, linkHandler);
 
-        InputStream instream = entity.obtainContent();
+        InputStream instream = task.getContentEntity().getContent();
         try {
             parser.parse(instream, parallelHandler, metadata, new ParseContext());
 
-            ArrayList<Link> extractedTasks = new ArrayList<Link>();
-            int depth = task.getDepth() + 1;
-            if (task instanceof LinkTask) {
-                for (org.apache.tika.sax.Link tikaLink : linkHandler.getLinks()) {
+            if (task instanceof LinkedTask) {
+                Set<LinkedTask> extractedTasks = new HashSet<LinkedTask>();
+                for (Link tikaLink : linkHandler.getLinks()) {
                     try {
                         URI uri = new URI(tikaLink.getUri());
                         // Test to see if the scheme is empty
                         // This would indicate a relative URL, so resolve it against the task URI
                         if (uri.getScheme() == null) {
-                            uri = ((Link) task).getURI().resolve(uri);
+                            uri = task.getURI().resolve(uri);
                         }
-                        extractedTasks.add(new LinkTask((Link) task, uri, depth, tikaLink.getText()));
+                        LinkedTask newTask = task.createTask(uri);
+                        newTask.getParserData().set(TikaParserData.ANCHOR_TEXT, tikaLink.getText());
+                        newTask.getParserData().set(TikaParserData.ANCHOR_TITLE, tikaLink.getTitle());
+                        extractedTasks.add(newTask);
                     } catch (URISyntaxException e) {
-                        if (LOG.isWarnEnabled()) {
-                            LOG.warn("URI not valid: " + tikaLink.getUri());
+                        if (logger.isWarnEnabled()) {
+                            logger.warn("URI not valid: " + tikaLink.getUri());
                         }
                     }
                 }
+                ((LinkedTask) task).setTo(extractedTasks);
+            }
+
+            task.getParserData().set(TikaParserData.XML_CONTENT, dataBuffer.toString());
+            task.getParserData().set(TikaParserData.TEXT, bodyBuffer.toString());
+            task.getParserData().set(TikaParserData.MAIN_CONTENT, mainContentBuffer.toString());
+            for (String key : metadata.names()) {
+                task.getParserData().set(key, metadata.getValues(key));
             }
-            return new TikaParseImpl(dataBuffer.toString(), extractedTasks, bodyBuffer.toString(), mainContentBuffer.toString(), metadata);
         } catch (SAXException ex) {
-            throw new DroidsException("Failure parsing document " + task.getId(), ex);
+            throw new DroidsException("Failure parsing document " + task.getURI(), ex);
         } catch (TikaException ex) {
-            throw new DroidsException("Failure parsing document " + task.getId(), ex);
+            throw new DroidsException("Failure parsing document " + task.getURI(), ex);
         } finally {
             instream.close();
         }

Added: incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/TikaParserData.java
URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/TikaParserData.java?rev=1445252&view=auto
==============================================================================
--- incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/TikaParserData.java (added)
+++ incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/TikaParserData.java Tue Feb 12 16:39:59 2013
@@ -0,0 +1,37 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.droids.tika;
+
+import org.apache.droids.parse.LinkedParserData;
+import org.apache.tika.metadata.*;
+
+import java.io.Serializable;
+
+/**
+ * Parser data for Tika results. Implements Tika's metadata key interfaces so their
+ * constants (e.g. TikaParserData.TITLE) can be used as names when storing data here.
+ */
+public class TikaParserData extends LinkedParserData implements CreativeCommons,
+        Geographic, HttpHeaders, Message, MSOffice, ClimateForcast, TIFF, TikaMetadataKeys,
+        TikaCoreProperties, TikaMimeKeys, Serializable {
+
+    public final static String XML_CONTENT = "xmlcontent"; // key under which TikaParser stores its dataBuffer output
+    public final static String MAIN_CONTENT = "maincontent"; // key under which TikaParser stores its mainContentBuffer output
+
+}

Propchange: incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/TikaParserData.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/TikaParserData.java
------------------------------------------------------------------------------
    svn:keywords = Author Date Id Revision

Propchange: incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/TikaParserData.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Copied: incubator/droids/branches/0.2.x-cleanup/droids-tika/src/test/java/org/apache/droids/tika/TikaParserTest.java (from r1444858, incubator/droids/branches/0.2.x-cleanup/droids-tika/src/test/java/org/apache/droids/tika/TikaHtmlParserTest.java)
URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-tika/src/test/java/org/apache/droids/tika/TikaParserTest.java?p2=incubator/droids/branches/0.2.x-cleanup/droids-tika/src/test/java/org/apache/droids/tika/TikaParserTest.java&p1=incubator/droids/branches/0.2.x-cleanup/droids-tika/src/test/java/org/apache/droids/tika/TikaHtmlParserTest.java&r1=1444858&r2=1445252&rev=1445252&view=diff
==============================================================================
--- incubator/droids/branches/0.2.x-cleanup/droids-tika/src/test/java/org/apache/droids/tika/TikaHtmlParserTest.java (original)
+++ incubator/droids/branches/0.2.x-cleanup/droids-tika/src/test/java/org/apache/droids/tika/TikaParserTest.java Tue Feb 12 16:39:59 2013
@@ -16,12 +16,32 @@
  */
 package org.apache.droids.tika;
 
-import junit.framework.TestCase;
+import junit.framework.Assert;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
-public class TikaHtmlParserTest extends TestCase {
+import java.io.InputStream;
 
-    public void testSomething() throws Exception {
-        // TODO -- test stuff!
-        assertTrue(true);
+public class TikaParserTest {
+    private final static Logger logger = LoggerFactory.getLogger(TikaParserTest.class); // only referenced by the commented-out dump in testParse
+
+    /** Runs TikaParser over the droids-test.odt resource and checks the title found in the parser data. */
+    @Test
+    public void testParse() throws Exception {
+        TikaTask task = new TikaTask(null, 0); // no URI, depth 0
+        InputStream content = ClassLoader.getSystemResourceAsStream("droids-test.odt"); // file under src/test/resources
+        task.getContentEntity().setContent(content);
+
+        TikaParser<TikaTask> parser = new TikaParser<TikaTask>();
+        parser.parse(task);
+
+        TikaParserData data = task.getParserData();
+/*
+        for (String name : data.names()) {
+            logger.info(name + ": " + data.get(name));
+        }
+*/
+        Assert.assertEquals("Droids Tika Test", data.get(TikaParserData.TITLE.getName())); // looked up under the name of Tika's TITLE property
+    }
+}
\ No newline at end of file

Added: incubator/droids/branches/0.2.x-cleanup/droids-tika/src/test/java/org/apache/droids/tika/TikaTask.java
URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-tika/src/test/java/org/apache/droids/tika/TikaTask.java?rev=1445252&view=auto
==============================================================================
--- incubator/droids/branches/0.2.x-cleanup/droids-tika/src/test/java/org/apache/droids/tika/TikaTask.java (added)
+++ incubator/droids/branches/0.2.x-cleanup/droids-tika/src/test/java/org/apache/droids/tika/TikaTask.java Tue Feb 12 16:39:59 2013
@@ -0,0 +1,76 @@
+package org.apache.droids.tika;
+
+import org.apache.droids.core.ContentEntity;
+import org.apache.droids.core.LinkedTask;
+
+import java.net.URI;
+import java.util.Date;
+
+/**
+ * Minimal LinkedTask used by the Tika parser tests. It holds a URI, a depth,
+ * a ContentEntity and a TikaParserData, and tracks an aborted flag;
+ * getFrom() and getTaskDate() always return null.
+ */
+public class TikaTask extends LinkedTask {
+    private ContentEntity contentEntity;
+    private TikaParserData parserData;
+    private URI uri;
+    private int depth;
+    private boolean aborted;
+
+
+    public TikaTask(URI uri, int depth) {
+        this.uri = uri;
+        this.depth = depth;
+        this.aborted = false;
+        this.contentEntity = new ContentEntity();
+        this.parserData = new TikaParserData();
+
+    }
+
+    @Override
+    public LinkedTask getFrom() {
+        return null; // this task does not record the task it was created from
+    }
+
+    @Override
+    public URI getURI() {
+        return uri;
+    }
+
+    @Override
+    public ContentEntity getContentEntity() {
+        return contentEntity;
+    }
+
+    @Override
+    public TikaParserData getParserData() {
+        return parserData;
+    }
+
+    @Override
+    public int getDepth() {
+        return depth;
+    }
+
+    @Override
+    public Date getTaskDate() {
+        return null; // no task date is kept
+    }
+
+    @Override
+    public void abort() {
+        aborted = true;
+    }
+
+    @Override
+    public boolean isAborted() {
+        return aborted;
+    }
+
+    @SuppressWarnings("unchecked")
+    @Override
+    public TikaTask createTask(URI uri) {
+        return new TikaTask(uri, depth + 1); // the new task is one level deeper than this one
+    }
+}

Propchange: incubator/droids/branches/0.2.x-cleanup/droids-tika/src/test/java/org/apache/droids/tika/TikaTask.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: incubator/droids/branches/0.2.x-cleanup/droids-tika/src/test/java/org/apache/droids/tika/TikaTask.java
------------------------------------------------------------------------------
    svn:keywords = Author Date Id Revision

Propchange: incubator/droids/branches/0.2.x-cleanup/droids-tika/src/test/java/org/apache/droids/tika/TikaTask.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/droids/branches/0.2.x-cleanup/droids-tika/src/test/resources/droids-test.odt
URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-tika/src/test/resources/droids-test.odt?rev=1445252&view=auto
==============================================================================
Binary file - no diff available.

Propchange: incubator/droids/branches/0.2.x-cleanup/droids-tika/src/test/resources/droids-test.odt
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: incubator/droids/branches/0.2.x-cleanup/droids-tika/src/test/resources/logback.xml
URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-tika/src/test/resources/logback.xml?rev=1445252&view=auto
==============================================================================
--- incubator/droids/branches/0.2.x-cleanup/droids-tika/src/test/resources/logback.xml (added)
+++ incubator/droids/branches/0.2.x-cleanup/droids-tika/src/test/resources/logback.xml Tue Feb 12 16:39:59 2013
@@ -0,0 +1,17 @@
+<configuration>
+    <!-- Logback configuration for the droids-tika tests: INFO-level logging to the console. -->
+    <appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
+        <!-- encoders are assigned the type
+             ch.qos.logback.classic.encoder.PatternLayoutEncoder by default -->
+        <encoder>
+            <pattern>%d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
+        </encoder>
+    </appender>
+
+    <logger name="org.apache.droids" level="info"/>
+
+    <!-- Root logger: level info, output attached to the STDOUT appender defined above. -->
+    <root level="info">
+        <appender-ref ref="STDOUT"/>
+    </root>
+</configuration>
\ No newline at end of file

Propchange: incubator/droids/branches/0.2.x-cleanup/droids-tika/src/test/resources/logback.xml
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: incubator/droids/branches/0.2.x-cleanup/droids-tika/src/test/resources/logback.xml
------------------------------------------------------------------------------
    svn:keywords = Author Date Id Revision

Propchange: incubator/droids/branches/0.2.x-cleanup/droids-tika/src/test/resources/logback.xml
------------------------------------------------------------------------------
    svn:mime-type = text/xml