You are viewing a plain text version of this content. The canonical link for it is here.
Posted to droids-commits@incubator.apache.org by ry...@apache.org on 2008/11/09 21:44:31 UTC

svn commit: r712559 - in /incubator/droids/trunk: ./ droids-spring/conf/ droids-tika/ droids-tika/src/ droids-tika/src/main/ droids-tika/src/main/java/ droids-tika/src/main/java/org/ droids-tika/src/main/java/org/apache/ droids-tika/src/main/java/org/a...

Author: ryan
Date: Sun Nov  9 13:44:30 2008
New Revision: 712559

URL: http://svn.apache.org/viewvc?rev=712559&view=rev
Log:
DROIDS-8:  Adding a tika module

Added:
    incubator/droids/trunk/droids-tika/
    incubator/droids/trunk/droids-tika/pom.xml   (with props)
    incubator/droids/trunk/droids-tika/src/
    incubator/droids/trunk/droids-tika/src/main/
    incubator/droids/trunk/droids-tika/src/main/java/
    incubator/droids/trunk/droids-tika/src/main/java/org/
    incubator/droids/trunk/droids-tika/src/main/java/org/apache/
    incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/
    incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/
    incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/EchoHandler.java
    incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/LinkExtractor.java
    incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java   (with props)
    incubator/droids/trunk/droids-tika/src/main/resources/
    incubator/droids/trunk/droids-tika/src/test/
    incubator/droids/trunk/droids-tika/src/test/java/
    incubator/droids/trunk/droids-tika/src/test/java/org/
    incubator/droids/trunk/droids-tika/src/test/java/org/apache/
    incubator/droids/trunk/droids-tika/src/test/java/org/apache/droids/
    incubator/droids/trunk/droids-tika/src/test/java/org/apache/droids/tika/
    incubator/droids/trunk/droids-tika/src/test/java/org/apache/droids/tika/TikaHtmlParserTest.java   (with props)
    incubator/droids/trunk/droids-tika/src/test/resources/
Removed:
    incubator/droids/trunk/droids-spring/conf/
Modified:
    incubator/droids/trunk/pom.xml

Added: incubator/droids/trunk/droids-tika/pom.xml
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-tika/pom.xml?rev=712559&view=auto
==============================================================================
--- incubator/droids/trunk/droids-tika/pom.xml (added)
+++ incubator/droids/trunk/droids-tika/pom.xml Sun Nov  9 13:44:30 2008
@@ -0,0 +1,55 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+  xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+
+  <modelVersion>4.0.0</modelVersion>
+  <parent>
+    <artifactId>droids</artifactId>
+    <groupId>org.apache.droids</groupId>
+    <version>0.1-incubating-SNAPSHOT</version>
+    <relativePath>../pom.xml</relativePath>
+  </parent>
+  <artifactId>droids-tika</artifactId>
+  <name>Droids Tika</name>
+  <inceptionYear>2007</inceptionYear>
+  <description>
+   Droids Tika Parser
+  </description>
+  <packaging>jar</packaging>
+  <!-- The junit version comes from the parent POM's dependencyManagement. -->
+  <dependencies>
+    <dependency>
+      <groupId>junit</groupId>
+      <artifactId>junit</artifactId>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.droids</groupId>
+      <artifactId>droids-core</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.tika</groupId>
+      <artifactId>tika</artifactId>
+      <version>0.2-SNAPSHOT</version>
+    </dependency>
+  </dependencies>
+
+</project>

Propchange: incubator/droids/trunk/droids-tika/pom.xml
------------------------------------------------------------------------------
    svn:executable = *

Added: incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/EchoHandler.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/EchoHandler.java?rev=712559&view=auto
==============================================================================
--- incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/EchoHandler.java (added)
+++ incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/EchoHandler.java Sun Nov  9 13:44:30 2008
@@ -0,0 +1,220 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.tika;
+
+import java.io.ByteArrayInputStream;
+import java.io.UnsupportedEncodingException;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * SAX handler that echoes the events it receives into an in-memory buffer,
+ * producing an XML representation of the parsed document.
+ */
+public class EchoHandler extends DefaultHandler {
+  /** Encoding named in the XML declaration and used to produce the bytes. */
+  protected String encoding = null;
+
+  /** Accumulated XML; replaced on every {@link #startDocument()}. */
+  private StringBuffer xmlBuffer = new StringBuffer();
+
+  /** Stream over {@link #bytes}; null until {@link #endDocument()} has run. */
+  private ByteArrayInputStream inputStream = null;
+
+  /** Encoded document; null until {@link #endDocument()} has run. */
+  private byte[] bytes;
+
+  /**
+   * Creates a handler that serializes with the given encoding.
+   *
+   * @param encoding charset name for the XML output; null, empty or blank
+   *          selects UTF-8
+   */
+  public EchoHandler(String encoding) {
+    // && short-circuits on null; trim() makes "" fall back to the default
+    // just like " " does, since neither is a usable charset name.
+    if (encoding != null && encoding.trim().length() > 0) {
+      this.encoding = encoding;
+    } else {
+      this.encoding = "UTF-8";
+    }
+  }
+
+  /**
+   * Starts a fresh buffer and writes the XML declaration.
+   *
+   * @see org.xml.sax.helpers.DefaultHandler#startDocument()
+   */
+  @Override
+  public void startDocument() throws SAXException {
+    xmlBuffer = new StringBuffer();
+    xmlBuffer.append("<?xml version=\"1.0\" encoding=\"" + encoding + "\"?>\r\n");
+  }
+
+  /**
+   * Encodes the buffered XML so that {@link #getBytes()} and
+   * {@link #getInputStream()} become available.
+   *
+   * @throws SAXException if the configured encoding is not supported
+   * @see org.xml.sax.helpers.DefaultHandler#endDocument()
+   */
+  @Override
+  public void endDocument() throws SAXException {
+    try {
+      setResult();
+    } catch (UnsupportedEncodingException e) {
+      throw new SAXException(e);
+    }
+  }
+
+  /**
+   * Writes the start tag, including all attributes, on its own line.
+   *
+   * @param uri namespace URI of the element (not written)
+   * @param loc local name; used when no qualified name is available
+   * @param raw qualified name, or the empty string if unavailable
+   * @param atts attributes attached to the element
+   * @see org.xml.sax.helpers.DefaultHandler#startElement(java.lang.String,
+   *      java.lang.String, java.lang.String, org.xml.sax.Attributes)
+   */
+  @Override
+  public void startElement(String uri, String loc, String raw, Attributes atts)
+      throws SAXException {
+    xmlBuffer.append("\r\n<" + tagName(loc, raw));
+    for (int i = 0; i < atts.getLength(); i++) {
+      xmlBuffer.append(' ').append(atts.getQName(i)).append("=\"");
+      // Attribute values must be escaped too, otherwise a quote, '&' or '<'
+      // inside a value yields XML that is not well-formed.
+      String value = atts.getValue(i);
+      if (value != null) {
+        appendEscaped(value.toCharArray(), 0, value.length(), true);
+      }
+      xmlBuffer.append('"');
+    }
+    xmlBuffer.append(">\r\n");
+  }
+
+  /**
+   * Writes the end tag on its own line.
+   *
+   * @param uri namespace URI of the element (not written)
+   * @param loc local name; used when no qualified name is available
+   * @param raw qualified name, or the empty string if unavailable
+   * @see org.xml.sax.helpers.DefaultHandler#endElement(java.lang.String,
+   *      java.lang.String, java.lang.String)
+   */
+  @Override
+  public void endElement(String uri, String loc, String raw)
+      throws SAXException {
+    xmlBuffer.append("\r\n</" + tagName(loc, raw) + ">\r\n");
+  }
+
+  /**
+   * Writes character data, escaping the XML markup characters.
+   *
+   * @param ch characters from the document
+   * @param start start position in the array
+   * @param length number of characters to read from the array
+   * @see org.xml.sax.helpers.DefaultHandler#characters(char[], int, int)
+   */
+  @Override
+  public void characters(char ch[], int start, int length) throws SAXException {
+    appendEscaped(ch, start, length, false);
+  }
+
+  /** Writes ignorable whitespace exactly like character data. */
+  @Override
+  public void ignorableWhitespace(char ch[], int start, int length)
+      throws SAXException {
+    this.characters(ch, start, length);
+  }
+
+  /**
+   * Writes a processing instruction.
+   *
+   * @param target processing instruction target
+   * @param data processing instruction data, or null if none was supplied
+   * @see org.xml.sax.helpers.DefaultHandler#processingInstruction(java.lang.String,
+   *      java.lang.String)
+   */
+  @Override
+  public void processingInstruction(String target, String data)
+      throws SAXException {
+    // SAX passes null when there is no data; do not echo the text "null".
+    xmlBuffer.append("<?" + target + (data == null ? "" : " " + data) + "?>");
+  }
+
+  /**
+   * Writes a skipped entity back as an entity reference.
+   *
+   * @param name name of the skipped entity
+   * @see org.xml.sax.helpers.DefaultHandler#skippedEntity(java.lang.String)
+   */
+  @Override
+  public void skippedEntity(String name) throws SAXException {
+    xmlBuffer.append("&" + name + ";");
+  }
+
+  /** Returns the qualified name, falling back to the local name. */
+  private static String tagName(String loc, String raw) {
+    return (raw == null || raw.length() == 0) ? loc : raw;
+  }
+
+  /** Appends text, escaping markup (and the quote inside attributes). */
+  private void appendEscaped(char[] ch, int start, int length, boolean attr) {
+    for (int i = start; i < start + length; i++) {
+      char c = ch[i];
+      if (c == '&') {
+        xmlBuffer.append("&amp;");
+      } else if (c == '<') {
+        xmlBuffer.append("&lt;");
+      } else if (c == '>') {
+        xmlBuffer.append("&gt;");
+      } else if (attr && c == '"') {
+        xmlBuffer.append("&quot;");
+      } else {
+        xmlBuffer.append(c);
+      }
+    }
+  }
+
+  /** Encodes the buffer and exposes it as a byte array and a stream. */
+  private void setResult() throws UnsupportedEncodingException {
+    // getBytes already names the offending charset in its exception, so it
+    // is left to propagate instead of being replaced by a blank one.
+    this.bytes = xmlBuffer.toString().getBytes(encoding);
+    this.inputStream = new ByteArrayInputStream(this.bytes);
+  }
+
+  /** Returns a stream over the encoded bytes; null before endDocument(). */
+  public ByteArrayInputStream getInputStream() {
+    return this.inputStream;
+  }
+
+  /** Returns the encoded document; null before endDocument(). */
+  public byte[] getBytes() {
+    return this.bytes;
+  }
+
+  /** Returns the XML gathered so far (empty before the first event). */
+  @Override
+  public String toString() {
+    return xmlBuffer.toString();
+  }
+}
\ No newline at end of file

Added: incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/LinkExtractor.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/LinkExtractor.java?rev=712559&view=auto
==============================================================================
--- incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/LinkExtractor.java (added)
+++ incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/LinkExtractor.java Sun Nov  9 13:44:30 2008
@@ -0,0 +1,124 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.tika;
+
+import java.net.URI;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.droids.LinkTask;
+import org.apache.droids.api.Link;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+public class LinkExtractor extends DefaultHandler {
+
+  protected final Log log = LogFactory.getLog(this.getClass());
+  
+  private Collection<Link> links = new ArrayList<Link>();;
+  
+  private Map<String,String> elements;
+  
+  private Link base = null;
+
+  private Set<String> history = new HashSet<String>();
+  
+  @Override
+  public void startElement(String uri, String loc, String raw, Attributes att)
+      throws SAXException {
+    Iterator<String> it = elements.keySet().iterator();
+    String elem, linkAtt;
+    while(it.hasNext())
+    {
+      elem = it.next();
+      linkAtt = elements.get(elem);
+      if (elem.equalsIgnoreCase(loc) && att.getValue(linkAtt) != null) {
+        
+        String link = att.getValue(linkAtt);
+        log.debug("Found element: " + elem + " with link: " + link);
+        //TODO: Buscar una forma genérica
+        //Matcher match = Pattern.compile("^(http://|/|\\.)").matcher(link);
+        if(!link.startsWith("#") && !link.equals("") && !link.startsWith("mailto"))
+          addOutlinkURI(link);
+      }
+    }
+    super.startElement(uri, loc, raw, att);
+  }
+
+  @Override
+  public void endDocument() throws SAXException {
+    super.endDocument();
+    history = null;
+  }
+
+  public void addOutlinkURI(String target) {
+    URI uri = null;
+    URI from = base.getURI();
+    try {
+      String newUrl = "";
+      if (target.startsWith("/")) {
+        newUrl = from.getScheme() + "://" + from.getHost();
+        if (from.getPort() > -1) {
+          newUrl += ":" + from.getPort();
+        }
+        newUrl += target;
+      } else if (!target.toLowerCase().startsWith("javascript")) {
+        newUrl = new URL(from.toURL(), target).toString();
+      }
+      if (!newUrl.equals("")) {
+        String aux = target.contains(":/") ? target : newUrl;
+        // TODO:
+        aux = aux.split("#")[0];
+        uri = new URI(aux);
+        if(history == null)
+          history = new HashSet<String>();
+        if(links == null)
+          links = new ArrayList<Link>();
+        if (history.add(aux)) {
+          // TODO? aux??
+          links.add(new LinkTask(base, uri, base.getDepth()+1 ));
+        }
+      }
+    } catch (Exception e) {
+      e.printStackTrace();
+    }
+  }
+  
+  public void setBase(Link base) {
+    this.base = base;
+  }
+
+  public Collection<Link> getLinks() {
+    return links;
+  }
+
+  public Map<String, String> getElements() {
+    return elements;
+  }
+
+  public void setElements(Map<String, String> elements) {
+    this.elements = elements;
+  }
+}

Added: incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java?rev=712559&view=auto
==============================================================================
--- incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java (added)
+++ incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java Sun Nov  9 13:44:30 2008
@@ -0,0 +1,75 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.tika;
+
+import java.io.InputStream;
+
+import org.apache.droids.ParseData;
+import org.apache.droids.api.Link;
+import org.apache.droids.api.Parse;
+import org.apache.droids.api.Parser;
+import org.apache.droids.helper.Loggable;
+import org.apache.droids.parse.ParseImpl;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.sax.TeeContentHandler;
+
+public class TikaHtmlParser extends Loggable implements Parser {
+
+  private Metadata metadata = null;
+
+  private org.apache.tika.parser.Parser parser = null;
+
+  private LinkExtractor extractor = null;
+
+
+  public LinkExtractor getExtractor() {
+    return extractor;
+  }
+
+
+  public void setExtractor(LinkExtractor extractor) {
+    this.extractor = extractor;
+  }
+
+
+  public Parse getParse(InputStream openStream, Link link) {
+    // Init Tika objects
+    parser = new AutoDetectParser();
+    metadata = new Metadata();
+    
+    //Init handlers
+    //TODO: Autodetect encoding
+    EchoHandler data = new EchoHandler("UTF-8"); 
+    extractor.setBase(link);
+    
+    TeeContentHandler parallelHandler = new TeeContentHandler(data, extractor);
+
+    try {
+      parser.parse(openStream, parallelHandler, metadata);
+      ParseData parseData = new ParseData(extractor.getLinks());
+      
+      return new ParseImpl(data.toString(), parseData);
+    } catch (Exception e) {
+      log.error("Parse error." + e);
+      // TODO Auto-generated catch block
+      e.printStackTrace();
+    } 
+    
+    return null;
+  }
+}

Propchange: incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java
------------------------------------------------------------------------------
    svn:executable = *

Added: incubator/droids/trunk/droids-tika/src/test/java/org/apache/droids/tika/TikaHtmlParserTest.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-tika/src/test/java/org/apache/droids/tika/TikaHtmlParserTest.java?rev=712559&view=auto
==============================================================================
--- incubator/droids/trunk/droids-tika/src/test/java/org/apache/droids/tika/TikaHtmlParserTest.java (added)
+++ incubator/droids/trunk/droids-tika/src/test/java/org/apache/droids/tika/TikaHtmlParserTest.java Sun Nov  9 13:44:30 2008
@@ -0,0 +1,28 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.tika;
+
+import junit.framework.TestCase;
+
+public class TikaHtmlParserTest extends TestCase {
+
+  /** Placeholder test; it always passes. */
+  public void testSomething() throws Exception {
+    // TODO -- test stuff!
+    assertTrue(true);
+  }
+}
\ No newline at end of file

Propchange: incubator/droids/trunk/droids-tika/src/test/java/org/apache/droids/tika/TikaHtmlParserTest.java
------------------------------------------------------------------------------
    svn:executable = *

Modified: incubator/droids/trunk/pom.xml
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/pom.xml?rev=712559&r1=712558&r2=712559&view=diff
==============================================================================
--- incubator/droids/trunk/pom.xml (original)
+++ incubator/droids/trunk/pom.xml Sun Nov  9 13:44:30 2008
@@ -115,10 +115,13 @@
         <artifactId>cocoon-spring-configurator</artifactId>
         <version>${spring-conf.version}</version>
       </dependency>
+      
+      <!-- TESTING -->
       <dependency>
         <groupId>junit</groupId>
         <artifactId>junit</artifactId>
         <version>${junit.version}</version>
+        <scope>test</scope>
       </dependency>
     </dependencies>
   </dependencyManagement>
@@ -184,6 +187,19 @@
 		</plugins>
 	</reporting>
 
+	<repositories>
+		<repository>
+			<id>org.apache.people</id>
+			<name>Apache Snapshot Repository</name>
+			<url>http://people.apache.org/repo/m2-snapshot-repository/</url>
+			<snapshots>
+				<enabled>true</enabled>
+			</snapshots>
+			<releases>
+				<enabled>true</enabled>
+			</releases>
+		</repository>
+	</repositories>
 
   <modules>
     <module>droids-norobots</module>
@@ -191,6 +207,7 @@
     <module>droids-spring</module>
     <module>droids-solr</module>
     <module>droids-wicket</module>
+    <module>droids-tika</module>
   </modules>
 
 </project>