You are viewing a plain text version of this content. The canonical link for it is here.
Posted to droids-commits@incubator.apache.org by ry...@apache.org on 2008/11/09 21:44:31 UTC
svn commit: r712559 - in /incubator/droids/trunk: ./ droids-spring/conf/
droids-tika/ droids-tika/src/ droids-tika/src/main/
droids-tika/src/main/java/ droids-tika/src/main/java/org/
droids-tika/src/main/java/org/apache/ droids-tika/src/main/java/org/a...
Author: ryan
Date: Sun Nov 9 13:44:30 2008
New Revision: 712559
URL: http://svn.apache.org/viewvc?rev=712559&view=rev
Log:
DROIDS-8: Adding a tika module
Added:
incubator/droids/trunk/droids-tika/
incubator/droids/trunk/droids-tika/pom.xml (with props)
incubator/droids/trunk/droids-tika/src/
incubator/droids/trunk/droids-tika/src/main/
incubator/droids/trunk/droids-tika/src/main/java/
incubator/droids/trunk/droids-tika/src/main/java/org/
incubator/droids/trunk/droids-tika/src/main/java/org/apache/
incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/
incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/
incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/EchoHandler.java
incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/LinkExtractor.java
incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java (with props)
incubator/droids/trunk/droids-tika/src/main/resources/
incubator/droids/trunk/droids-tika/src/test/
incubator/droids/trunk/droids-tika/src/test/java/
incubator/droids/trunk/droids-tika/src/test/java/org/
incubator/droids/trunk/droids-tika/src/test/java/org/apache/
incubator/droids/trunk/droids-tika/src/test/java/org/apache/droids/
incubator/droids/trunk/droids-tika/src/test/java/org/apache/droids/tika/
incubator/droids/trunk/droids-tika/src/test/java/org/apache/droids/tika/TikaHtmlParserTest.java (with props)
incubator/droids/trunk/droids-tika/src/test/resources/
Removed:
incubator/droids/trunk/droids-spring/conf/
Modified:
incubator/droids/trunk/pom.xml
Added: incubator/droids/trunk/droids-tika/pom.xml
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-tika/pom.xml?rev=712559&view=auto
==============================================================================
--- incubator/droids/trunk/droids-tika/pom.xml (added)
+++ incubator/droids/trunk/droids-tika/pom.xml Sun Nov 9 13:44:30 2008
@@ -0,0 +1,55 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+
+ <modelVersion>4.0.0</modelVersion>
+ <parent>
+ <artifactId>droids</artifactId>
+ <groupId>org.apache.droids</groupId>
+ <version>0.1-incubating-SNAPSHOT</version>
+ <relativePath>../pom.xml</relativePath>
+ </parent>
+ <artifactId>droids-tika</artifactId>
+ <name>Droids Tika</name>
+ <inceptionYear>2007</inceptionYear>
+ <description>
+ Droids Tika Parser
+ </description>
+ <packaging>jar</packaging>
+
+ <dependencies>
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+   <groupId>org.apache.droids</groupId>
+   <artifactId>droids-core</artifactId>
+   <!-- project.version replaces the deprecated pom.version alias -->
+   <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika</artifactId>
+ <version>0.2-SNAPSHOT</version>
+ </dependency>
+ </dependencies>
+
+</project>
Propchange: incubator/droids/trunk/droids-tika/pom.xml
------------------------------------------------------------------------------
svn:executable = *
Added: incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/EchoHandler.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/EchoHandler.java?rev=712559&view=auto
==============================================================================
--- incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/EchoHandler.java (added)
+++ incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/EchoHandler.java Sun Nov 9 13:44:30 2008
@@ -0,0 +1,220 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.tika;
+
+import java.io.ByteArrayInputStream;
+import java.io.UnsupportedEncodingException;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * Handler that writes the SAX events it receives into a StringBuffer,
+ * building an XML representation of the parsed document.
+ */
+public class EchoHandler extends DefaultHandler {
+ protected String encoding = null;
+
+ private StringBuffer xmlBuffer = null;
+
+ private ByteArrayInputStream inputStream = null;
+
+ private byte[] bytes;
+
+ /**
+  * Creates a handler that emits its XML output in the given encoding.
+  *
+  * @param encoding
+  *          charset name written into the XML declaration and used to encode
+  *          the result bytes; {@code null} or a blank value selects UTF-8
+  */
+ public EchoHandler(String encoding) {
+   // Use the short-circuit && (the original used the bitwise &) and treat any
+   // blank string, not just a single space, as "no encoding supplied".
+   if (encoding != null && encoding.trim().length() > 0) {
+     this.encoding = encoding;
+   } else {
+     this.encoding = "UTF-8";
+   }
+ }
+
+ /*
+ * Receive notification of the beginning of a document. (non-Javadoc)
+ *
+ * @see org.xml.sax.helpers.DefaultHandler#startDocument()
+ */
+ public void startDocument() throws SAXException {
+ xmlBuffer = new StringBuffer();
+ xmlBuffer.append("<?xml version=\"1.0\" encoding=\"" + encoding + "\"?>\r\n");
+ }
+
+ /*
+  * Ends the document by encoding the buffered XML into its byte form. An
+  * unsupported encoding is reported to the caller wrapped in a SAXException.
+  *
+  * @see org.xml.sax.helpers.DefaultHandler#endDocument()
+  */
+ public void endDocument() throws SAXException {
+   try {
+     setResult();
+   } catch (UnsupportedEncodingException unsupported) {
+     throw new SAXException(unsupported);
+   }
+ }
+
+ /*
+  * Writes the opening tag of an element together with all of its attributes.
+  *
+  * @param uri  The Namespace URI (not used by this handler).
+  * @param loc  The local name without prefix (not used by this handler).
+  * @param raw  The raw XML 1.0 name (with prefix); written as the tag name.
+  * @param atts The attributes attached to the element; each one is written
+  *             as qName="value" with the value escaped.
+  *
+  * @see org.xml.sax.helpers.DefaultHandler#startElement(java.lang.String,
+  *      java.lang.String, java.lang.String, org.xml.sax.Attributes)
+  */
+ public void startElement(String uri, String loc, String raw, Attributes atts)
+     throws SAXException {
+   xmlBuffer.append("\r\n<").append(raw);
+   for (int i = 0; i < atts.getLength(); i++) {
+     xmlBuffer.append(" ").append(atts.getQName(i)).append("=\"");
+     appendAttributeValue(atts.getValue(i));
+     xmlBuffer.append("\"");
+   }
+   xmlBuffer.append(">\r\n");
+ }
+
+ /*
+  * Appends an attribute value, escaping the characters that would otherwise
+  * terminate the double-quoted value or start markup. A null value is written
+  * as an empty value.
+  */
+ private void appendAttributeValue(String value) {
+   if (value == null) {
+     return;
+   }
+   for (int i = 0; i < value.length(); i++) {
+     char c = value.charAt(i);
+     switch (c) {
+       case '&':
+         xmlBuffer.append("&amp;");
+         break;
+       case '<':
+         xmlBuffer.append("&lt;");
+         break;
+       case '"':
+         xmlBuffer.append("&quot;");
+         break;
+       default:
+         xmlBuffer.append(c);
+     }
+   }
+ }
+
+ /*
+  * Writes the closing tag of an element, surrounded by CRLF line breaks.
+  *
+  * @param uri The Namespace URI (not used by this handler).
+  * @param loc The local name without prefix (not used by this handler).
+  * @param raw The raw XML 1.0 name (with prefix); written as the tag name.
+  *
+  * @see org.xml.sax.helpers.DefaultHandler#endElement(java.lang.String,
+  *      java.lang.String, java.lang.String)
+  */
+ public void endElement(String uri, String loc, String raw)
+     throws SAXException {
+   xmlBuffer.append("\r\n</").append(raw).append(">\r\n");
+ }
+
+ /*
+ * Receive notification of character data.
+ *
+ * @param ch The characters from the XML document. @param start The start
+ * position in the array. @param length The number of characters to read from
+ * the array.
+ *
+ * @see org.xml.sax.helpers.DefaultHandler#characters(char[], int, int)
+ */
+ public void characters(char ch[], int start, int length) throws SAXException {
+ for (int i = 0; i < length; i++) {
+ char c = ch[start + i];
+ if (c == '&') {
+ xmlBuffer.append("&");
+ } else if (c == '<') {
+ xmlBuffer.append("<");
+ } else if (c == '>') {
+ xmlBuffer.append(">");
+ } else {
+ xmlBuffer.append(c);
+ }
+ }
+ }
+
+ /*
+  * Treats ignorable whitespace exactly like ordinary character data by
+  * forwarding it to characters(char[], int, int).
+  *
+  * @param ch     The characters from the XML document.
+  * @param start  The start position in the array.
+  * @param length The number of characters to read from the array.
+  *
+  * @see org.xml.sax.helpers.DefaultHandler#ignorableWhitespace(char[], int,
+  *      int)
+  */
+ public void ignorableWhitespace(char ch[], int start, int length)
+     throws SAXException {
+   characters(ch, start, length);
+ }
+
+ /*
+  * Writes a processing instruction to the buffer.
+  *
+  * @param target The processing instruction target.
+  * @param data   The processing instruction data, or null if none was
+  *               supplied; when null only the target is written.
+  *
+  * @see org.xml.sax.helpers.DefaultHandler#processingInstruction(java.lang.String,
+  *      java.lang.String)
+  */
+ public void processingInstruction(String target, String data)
+     throws SAXException {
+   xmlBuffer.append("<?").append(target);
+   // data may legitimately be null; do not emit the literal text "null".
+   if (data != null) {
+     xmlBuffer.append(" ").append(data);
+   }
+   xmlBuffer.append("?>");
+ }
+
+ /*
+  * Writes a skipped entity back into the buffer as an entity reference.
+  *
+  * @param name The name of the skipped entity. If it is a parameter entity,
+  *             the name will begin with '%'.
+  *
+  * @see org.xml.sax.helpers.DefaultHandler#skippedEntity(java.lang.String)
+  */
+ public void skippedEntity(String name) throws SAXException {
+   xmlBuffer.append("&").append(name).append(";");
+ }
+
+ /*
+  * Encodes the buffered XML with the configured encoding and exposes the
+  * result both as a byte array and as a stream over that array.
+  *
+  * @throws UnsupportedEncodingException if the configured encoding is not
+  *         supported by the JVM
+  */
+ private void setResult() throws UnsupportedEncodingException {
+   // Let the exception from getBytes propagate unchanged: it names the
+   // offending charset, which a freshly created, message-less copy would lose.
+   this.bytes = xmlBuffer.toString().getBytes(encoding);
+   this.inputStream = new ByteArrayInputStream(this.bytes);
+ }
+
+ /**
+  * Returns the stream over the encoded document. The stream is created when
+  * the end of the document is reached, so this is {@code null} before then;
+  * every call returns the same stream instance.
+  *
+  * @return the input stream, or {@code null} if no document has been finished
+  */
+ public ByteArrayInputStream getInputStream() {
+   return inputStream;
+ }
+
+ /**
+  * Returns the encoded document as a byte array. The array is created when
+  * the end of the document is reached, so this is {@code null} before then.
+  * The internal array itself is returned, not a copy.
+  *
+  * @return the encoded bytes, or {@code null} if no document has been finished
+  */
+ public byte[] getBytes() {
+   return bytes;
+ }
+
+ /**
+  * String representation of the data.
+  *
+  * @return the XML buffered so far, or an empty string when no document has
+  *         been started yet
+  */
+ @Override
+ public String toString() {
+   // The buffer only exists once startDocument has run; toString must not
+   // throw a NullPointerException before that.
+   return xmlBuffer == null ? "" : xmlBuffer.toString();
+ }
+}
\ No newline at end of file
Added: incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/LinkExtractor.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/LinkExtractor.java?rev=712559&view=auto
==============================================================================
--- incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/LinkExtractor.java (added)
+++ incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/LinkExtractor.java Sun Nov 9 13:44:30 2008
@@ -0,0 +1,124 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.tika;
+
+import java.net.URI;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.droids.LinkTask;
+import org.apache.droids.api.Link;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+public class LinkExtractor extends DefaultHandler {
+
+ protected final Log log = LogFactory.getLog(this.getClass());
+
+ private Collection<Link> links = new ArrayList<Link>();;
+
+ private Map<String,String> elements;
+
+ private Link base = null;
+
+ private Set<String> history = new HashSet<String>();
+
+ /**
+  * Compares the element with the configured element/attribute pairs and, on
+  * a match, registers the attribute value as an outgoing link. Values that
+  * are empty or start with "#" or "mailto" are skipped.
+  *
+  * @param uri the Namespace URI (passed through to the superclass)
+  * @param loc the local name, compared case-insensitively with the
+  *          configured element names
+  * @param raw the qualified name (passed through to the superclass)
+  * @param att the attributes of the element
+  */
+ @Override
+ public void startElement(String uri, String loc, String raw, Attributes att)
+     throws SAXException {
+   // Without a configured element map there is nothing to match against.
+   if (elements != null) {
+     for (Map.Entry<String, String> entry : elements.entrySet()) {
+       String elem = entry.getKey();
+       if (!elem.equalsIgnoreCase(loc)) {
+         continue;
+       }
+       String link = att.getValue(entry.getValue());
+       if (link == null) {
+         continue;
+       }
+       log.debug("Found element: " + elem + " with link: " + link);
+       // TODO: find a generic way to do this
+       // Matcher match = Pattern.compile("^(http://|/|\\.)").matcher(link);
+       if (!link.startsWith("#") && !link.equals("") && !link.startsWith("mailto")) {
+         addOutlinkURI(link);
+       }
+     }
+   }
+   super.startElement(uri, loc, raw, att);
+ }
+
+ /**
+  * Drops the set of already seen URLs at the end of a document;
+  * addOutlinkURI creates a new set on demand.
+  */
+ @Override
+ public void endDocument() throws SAXException {
+   history = null;
+   super.endDocument();
+ }
+
+ public void addOutlinkURI(String target) {
+ URI uri = null;
+ URI from = base.getURI();
+ try {
+ String newUrl = "";
+ if (target.startsWith("/")) {
+ newUrl = from.getScheme() + "://" + from.getHost();
+ if (from.getPort() > -1) {
+ newUrl += ":" + from.getPort();
+ }
+ newUrl += target;
+ } else if (!target.toLowerCase().startsWith("javascript")) {
+ newUrl = new URL(from.toURL(), target).toString();
+ }
+ if (!newUrl.equals("")) {
+ String aux = target.contains(":/") ? target : newUrl;
+ // TODO:
+ aux = aux.split("#")[0];
+ uri = new URI(aux);
+ if(history == null)
+ history = new HashSet<String>();
+ if(links == null)
+ links = new ArrayList<Link>();
+ if (history.add(aux)) {
+ // TODO? aux??
+ links.add(new LinkTask(base, uri, base.getDepth()+1 ));
+ }
+ }
+ } catch (Exception e) {
+ e.printStackTrace();
+ }
+ }
+
+ /**
+ * Sets the link whose URI relative targets are resolved against; it is also
+ * passed as the first argument of every LinkTask created by addOutlinkURI.
+ *
+ * @param base the link of the document being parsed
+ */
+ public void setBase(Link base) {
+ this.base = base;
+ }
+
+ /**
+ * Returns the links collected so far. The internal collection itself is
+ * returned, not a copy.
+ * NOTE(review): nothing in this class clears the collection, so links from
+ * earlier documents appear to remain when the extractor is reused - confirm
+ * that this is intended.
+ *
+ * @return the collected links
+ */
+ public Collection<Link> getLinks() {
+ return links;
+ }
+
+ /**
+ * Returns the map from element name to the name of the attribute that holds
+ * the link for that element.
+ *
+ * @return the element/attribute map, or null if none has been set
+ */
+ public Map<String, String> getElements() {
+ return elements;
+ }
+
+ /**
+ * Sets the map from element name to the name of the attribute that holds
+ * the link for that element; startElement consults it for every element.
+ *
+ * @param elements the element/attribute map
+ */
+ public void setElements(Map<String, String> elements) {
+ this.elements = elements;
+ }
+}
Added: incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java?rev=712559&view=auto
==============================================================================
--- incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java (added)
+++ incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java Sun Nov 9 13:44:30 2008
@@ -0,0 +1,75 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.tika;
+
+import java.io.InputStream;
+
+import org.apache.droids.ParseData;
+import org.apache.droids.api.Link;
+import org.apache.droids.api.Parse;
+import org.apache.droids.api.Parser;
+import org.apache.droids.helper.Loggable;
+import org.apache.droids.parse.ParseImpl;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.sax.TeeContentHandler;
+
+public class TikaHtmlParser extends Loggable implements Parser {
+
+ private Metadata metadata = null;
+
+ private org.apache.tika.parser.Parser parser = null;
+
+ private LinkExtractor extractor = null;
+
+
+ /**
+ * Returns the link extractor used by getParse.
+ *
+ * @return the link extractor, or null if none has been set
+ */
+ public LinkExtractor getExtractor() {
+ return extractor;
+ }
+
+
+ /**
+ * Sets the link extractor that getParse feeds with the parser's SAX events.
+ *
+ * @param extractor the link extractor to use
+ */
+ public void setExtractor(LinkExtractor extractor) {
+ this.extractor = extractor;
+ }
+
+
+ /**
+  * Parses the stream with a Tika AutoDetectParser, feeding the SAX events to
+  * both an EchoHandler (XML text) and the configured link extractor.
+  *
+  * @param openStream the content to parse
+  * @param link the link of the content; set as base on the link extractor
+  * @return the parse holding the echoed XML and the extracted links, or
+  *         {@code null} when parsing failed
+  */
+ public Parse getParse(InputStream openStream, Link link) {
+   // Init Tika objects
+   parser = new AutoDetectParser();
+   metadata = new Metadata();
+
+   // Init handlers
+   // TODO: Autodetect encoding
+   EchoHandler data = new EchoHandler("UTF-8");
+   extractor.setBase(link);
+
+   TeeContentHandler parallelHandler = new TeeContentHandler(data, extractor);
+
+   try {
+     parser.parse(openStream, parallelHandler, metadata);
+     ParseData parseData = new ParseData(extractor.getLinks());
+
+     return new ParseImpl(data.toString(), parseData);
+   } catch (Exception e) {
+     // Hand the exception to the logger so the stack trace ends up in the
+     // log rather than on stderr via printStackTrace.
+     log.error("Parse error.", e);
+     return null;
+   }
+ }
+}
Propchange: incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java
------------------------------------------------------------------------------
svn:executable = *
Added: incubator/droids/trunk/droids-tika/src/test/java/org/apache/droids/tika/TikaHtmlParserTest.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-tika/src/test/java/org/apache/droids/tika/TikaHtmlParserTest.java?rev=712559&view=auto
==============================================================================
--- incubator/droids/trunk/droids-tika/src/test/java/org/apache/droids/tika/TikaHtmlParserTest.java (added)
+++ incubator/droids/trunk/droids-tika/src/test/java/org/apache/droids/tika/TikaHtmlParserTest.java Sun Nov 9 13:44:30 2008
@@ -0,0 +1,28 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.tika;
+
+import junit.framework.TestCase;
+
+/**
+ * Placeholder test case for the Tika module; it does not exercise
+ * TikaHtmlParser yet.
+ */
+public class TikaHtmlParserTest extends TestCase {
+
+ // Always passes; it contains no real assertions yet.
+ public void testSomething() throws Exception
+ {
+ // TODO -- test stuff!
+ assertTrue( true );
+ }
+}
\ No newline at end of file
Propchange: incubator/droids/trunk/droids-tika/src/test/java/org/apache/droids/tika/TikaHtmlParserTest.java
------------------------------------------------------------------------------
svn:executable = *
Modified: incubator/droids/trunk/pom.xml
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/pom.xml?rev=712559&r1=712558&r2=712559&view=diff
==============================================================================
--- incubator/droids/trunk/pom.xml (original)
+++ incubator/droids/trunk/pom.xml Sun Nov 9 13:44:30 2008
@@ -115,10 +115,13 @@
<artifactId>cocoon-spring-configurator</artifactId>
<version>${spring-conf.version}</version>
</dependency>
+
+ <!-- TESTING -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>${junit.version}</version>
+ <scope>test</scope>
</dependency>
</dependencies>
</dependencyManagement>
@@ -184,6 +187,19 @@
</plugins>
</reporting>
+ <repositories>
+ <repository>
+ <id>org.apache.people</id>
+ <name>Apache Snapshot Repository</name>
+ <url>http://people.apache.org/repo/m2-snapshot-repository/</url>
+ <snapshots>
+ <enabled>true</enabled>
+ </snapshots>
+ <releases>
+ <enabled>true</enabled>
+ </releases>
+ </repository>
+ </repositories>
<modules>
<module>droids-norobots</module>
@@ -191,6 +207,7 @@
<module>droids-spring</module>
<module>droids-solr</module>
<module>droids-wicket</module>
+ <module>droids-tika</module>
</modules>
</project>