You are viewing a plain text version of this content. The canonical link for it is here.
Posted to droids-commits@incubator.apache.org by rf...@apache.org on 2011/11/23 03:28:45 UTC
svn commit: r1205272 - in
/incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika:
EchoHandler.java TikaDocumentParser.java TikaHtmlParser.java api/
api/TikaParse.java api/TikaParser.java parse/ parse/TikaParseImpl.java
Author: rfrovarp
Date: Wed Nov 23 03:28:44 2011
New Revision: 1205272
URL: http://svn.apache.org/viewvc?rev=1205272&view=rev
Log:
Start working on new Tika functionality.
Added:
incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/api/
incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/api/TikaParse.java
incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/api/TikaParser.java
incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/parse/
incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/parse/TikaParseImpl.java
Removed:
incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/EchoHandler.java
Modified:
incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaDocumentParser.java
incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java
Modified: incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaDocumentParser.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaDocumentParser.java?rev=1205272&r1=1205271&r2=1205272&view=diff
==============================================================================
--- incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaDocumentParser.java (original)
+++ incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaDocumentParser.java Wed Nov 23 03:28:44 2011
@@ -23,10 +23,10 @@ import java.io.InputStream;
import org.apache.droids.api.ContentEntity;
import org.apache.droids.api.Link;
-import org.apache.droids.api.Parse;
-import org.apache.droids.api.Parser;
import org.apache.droids.exception.DroidsException;
-import org.apache.droids.parse.ParseImpl;
+import org.apache.droids.tika.api.TikaParse;
+import org.apache.droids.tika.api.TikaParser;
+import org.apache.droids.tika.parse.TikaParseImpl;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
@@ -34,10 +34,10 @@ import org.apache.tika.parser.ParseConte
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.SAXException;
-public class TikaDocumentParser implements Parser {
+public class TikaDocumentParser implements TikaParser {
@Override
- public Parse parse(ContentEntity entity, Link link) throws DroidsException,
+ public TikaParse parse(ContentEntity entity, Link link) throws DroidsException,
IOException {
org.apache.tika.parser.Parser parser = new AutoDetectParser();
Metadata metadata = new Metadata();
@@ -46,7 +46,7 @@ public class TikaDocumentParser implemen
InputStream instream = entity.obtainContent();
try {
parser.parse(instream, handler, metadata, new ParseContext());
- ParseImpl parse = new ParseImpl(handler.toString(),null);
+ TikaParseImpl parse = new TikaParseImpl(handler.toString(),null);
return parse;
Modified: incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java?rev=1205272&r1=1205271&r2=1205272&view=diff
==============================================================================
--- incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java (original)
+++ incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java Wed Nov 23 03:28:44 2011
@@ -18,16 +18,23 @@ package org.apache.droids.tika;
import java.io.IOException;
import java.io.InputStream;
+import java.io.StringWriter;
import java.util.HashMap;
import java.util.Map;
+import javax.xml.transform.OutputKeys;
+import javax.xml.transform.TransformerConfigurationException;
+import javax.xml.transform.sax.SAXTransformerFactory;
+import javax.xml.transform.sax.TransformerHandler;
+import javax.xml.transform.stream.StreamResult;
+
import org.apache.droids.api.ContentEntity;
import org.apache.droids.api.Link;
-import org.apache.droids.api.Parse;
-import org.apache.droids.api.Parser;
import org.apache.droids.exception.DroidsException;
-import org.apache.droids.parse.ParseImpl;
import org.apache.droids.parse.html.LinkExtractor;
+import org.apache.droids.tika.api.TikaParse;
+import org.apache.droids.tika.api.TikaParser;
+import org.apache.droids.tika.parse.TikaParseImpl;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
@@ -35,7 +42,7 @@ import org.apache.tika.parser.ParseConte
import org.apache.tika.sax.TeeContentHandler;
import org.xml.sax.SAXException;
-public class TikaHtmlParser implements Parser {
+public class TikaHtmlParser implements TikaParser {
private Map<String, String> elements= null;
@@ -51,7 +58,7 @@ public class TikaHtmlParser implements P
}
@Override
- public Parse parse(ContentEntity entity, Link link) throws IOException, DroidsException {
+ public TikaParse parse(ContentEntity entity, Link link) throws IOException, DroidsException {
// Init Tika objects
org.apache.tika.parser.Parser parser = new AutoDetectParser();
Metadata metadata = new Metadata();
@@ -60,16 +67,28 @@ public class TikaHtmlParser implements P
if (charset == null) {
charset = "UTF-8";
}
- EchoHandler data = new EchoHandler(charset);
+
+ StringWriter dataBuffer = new StringWriter();
+
+ SAXTransformerFactory factory = (SAXTransformerFactory) SAXTransformerFactory.newInstance();
+ TransformerHandler xmlHandler;
+ try {
+ xmlHandler = factory.newTransformerHandler();
+ } catch (TransformerConfigurationException e) {
+ throw new DroidsException(e);
+ }
+ xmlHandler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
+ xmlHandler.setResult(new StreamResult(dataBuffer));
+
LinkExtractor extractor = new LinkExtractor(link, elements);
- TeeContentHandler parallelHandler = new TeeContentHandler(data, extractor);
+ TeeContentHandler parallelHandler = new TeeContentHandler(xmlHandler, extractor);
InputStream instream = entity.obtainContent();
try {
parser.parse(instream, parallelHandler, metadata, new ParseContext());
- return new ParseImpl(data.toString(), extractor.getLinks());
+ return new TikaParseImpl(dataBuffer.toString(), extractor.getLinks());
} catch (SAXException ex) {
throw new DroidsException("Failure parsing document " + link.getId(), ex);
} catch (TikaException ex) {
Added: incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/api/TikaParse.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/api/TikaParse.java?rev=1205272&view=auto
==============================================================================
--- incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/api/TikaParse.java (added)
+++ incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/api/TikaParse.java Wed Nov 23 03:28:44 2011
@@ -0,0 +1,7 @@
+package org.apache.droids.tika.api;
+
+import org.apache.droids.api.Parse;
+
+public interface TikaParse extends Parse {
+
+}
Added: incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/api/TikaParser.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/api/TikaParser.java?rev=1205272&view=auto
==============================================================================
--- incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/api/TikaParser.java (added)
+++ incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/api/TikaParser.java Wed Nov 23 03:28:44 2011
@@ -0,0 +1,7 @@
+package org.apache.droids.tika.api;
+
+import org.apache.droids.api.Parser;
+
+public interface TikaParser extends Parser {
+
+}
Added: incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/parse/TikaParseImpl.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/parse/TikaParseImpl.java?rev=1205272&view=auto
==============================================================================
--- incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/parse/TikaParseImpl.java (added)
+++ incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/parse/TikaParseImpl.java Wed Nov 23 03:28:44 2011
@@ -0,0 +1,19 @@
+package org.apache.droids.tika.parse;
+
+import java.util.Collection;
+
+import org.apache.droids.api.Link;
+import org.apache.droids.parse.ParseImpl;
+import org.apache.droids.tika.api.TikaParse;
+
+public class TikaParseImpl extends ParseImpl implements TikaParse {
+
+ public TikaParseImpl(String text, Collection<Link> outlinks) {
+ super(text,outlinks);
+ }
+
+ public TikaParseImpl(String text, Object data, Collection<Link> outlinks) {
+ super(text,data,outlinks);
+ }
+
+}