You are viewing a plain text version of this content. The canonical link for it is here.
Posted to droids-commits@incubator.apache.org by th...@apache.org on 2009/04/08 04:31:11 UTC

svn commit: r762982 - in /incubator/droids/trunk: droids-core/src/main/java/org/apache/droids/ droids-core/src/main/java/org/apache/droids/api/ droids-core/src/main/java/org/apache/droids/parse/ droids-core/src/main/java/org/apache/droids/parse/html/ d...

Author: thorsten
Date: Tue Apr  7 21:48:48 2009
New Revision: 762982

URL: http://svn.apache.org/viewvc?rev=762982&view=rev
Log:
DROIDS-44 (DROIDS-11)
# reduce the depth of nesting in the API
# Parse should support holding custom data

Patch submitted by Mingfai Ma. Thank you.

Also fixing TikaHtmlParser, which the original patch forgot to update.

Modified:
    incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/ParseData.java
    incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/Parse.java
    incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/Parser.java
    incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/ParseImpl.java
    incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/HtmlParser.java
    incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/robot/crawler/CrawlingWorker.java
    incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java

Modified: incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/ParseData.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/ParseData.java?rev=762982&r1=762981&r2=762982&view=diff
==============================================================================
--- incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/ParseData.java (original)
+++ incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/ParseData.java Tue Apr  7 21:48:48 2009
@@ -1,56 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.droids;
-
-import java.util.Collection;
-import java.util.Collections;
-
-import org.apache.droids.api.Link;
-
-/**
- * The result object that are filled by a parser
- * 
- * @version 1.0
- * 
- */
-public class ParseData {
-  private final Collection<Link> outlinks;
-
-  /**
-   * Create a new instance of Parse data for the given outlinks
-   * 
-   * @param outlinks
-   *                the collection of outlinks (used directly)
-   */
-  public ParseData(Collection<Link> outlinks) {
-    if( outlinks == null ) {
-      this.outlinks = Collections.emptyList();
-    }
-    else {
-      this.outlinks = outlinks;
-    }
-  }
-
-  /**
-   * Get the outlinks of the page.
-   * 
-   * @return all outlinks 
-   */
-  public Collection<Link> getOutlinks() {
-    return outlinks;
-  }
-}

Modified: incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/Parse.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/Parse.java?rev=762982&r1=762981&r2=762982&view=diff
==============================================================================
--- incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/Parse.java (original)
+++ incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/Parse.java Tue Apr  7 21:48:48 2009
@@ -16,29 +16,35 @@
  */
 package org.apache.droids.api;
 
-import org.apache.droids.ParseData;
+
+import java.util.Collection;
 
 /**
  * Wrapper object that encapsulate the result of the parsing of the underlying
  * document.
- * 
+ *
  * @version 1.0
- * 
+ *
  */
 public interface Parse {
   /**
    * The textual content of the page. This is indexed, searched, and used when
    * generating snippets.
-   * 
+   *
    * @return the textual representation of the underlying page.
    */
   String getText();
 
   /**
    * Other data extracted from the page.
-   * 
+   *
    * @return the processed parse data.
-   * @see ParseData
    */
-  ParseData getData();
+  Object getData();
+
+    /**
+     * Outlinks extracted from the Content Entity
+     * @return
+     */
+  Collection<Link> getOutlinks();
 }

Modified: incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/Parser.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/Parser.java?rev=762982&r1=762981&r2=762982&view=diff
==============================================================================
--- incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/Parser.java (original)
+++ incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/Parser.java Tue Apr  7 21:48:48 2009
@@ -31,11 +31,11 @@
   /**
    * Creates the parse for some content.
    * 
-   * @param openStream
+   * @param entity
    *                the underlying stream we are using
    * @param link
    *                the link that correspond to the stream
    * @return the parse object
    */
-  Parse getParse(ContentEntity entity, Link link) throws DroidsException, IOException;
+  Parse parse(ContentEntity entity, Link link) throws DroidsException, IOException;
 }

Modified: incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/ParseImpl.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/ParseImpl.java?rev=762982&r1=762981&r2=762982&view=diff
==============================================================================
--- incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/ParseImpl.java (original)
+++ incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/ParseImpl.java Tue Apr  7 21:48:48 2009
@@ -16,49 +16,57 @@
  */
 package org.apache.droids.parse;
 
-import org.apache.droids.ParseData;
 import org.apache.droids.api.Parse;
+import org.apache.droids.api.Link;
+
+import java.util.Collection;
 
 /**
  * Default implementation of Parse
- * 
+ *
  * @version 1.0
- * 
  */
 public class ParseImpl implements Parse {
+    protected String text;
+    protected Object data;
+    protected Collection<Link> outlinks;
+
+    public ParseImpl() {}
+
+    public ParseImpl(String text, Collection<Link> outlinks) {
+        this.text = text;
+        this.outlinks = outlinks;
+    }
+
+    public ParseImpl(String text, Object data, Collection<Link> outlinks) {
+        this.text = text;
+        this.data = data;
+        this.outlinks = outlinks;
+    }
+
+    public String getText() {
+        return text;
+    }
+
+    public void setText(String text) {
+        this.text = text;
+    }
+
+    public Object getData() {
+        return data;
+    }
+
+    public void setData(Object data) {
+        this.data = data;
+    }
+
+    public Collection<Link> getOutlinks() {
+        return outlinks;
+    }
+
+    public void setOutlinks(Collection<Link> outlinks) {
+        this.outlinks = outlinks;
+    }
 
-  private ParseData parseData;
-
-  private String text;
-
-  /**
-   * Create a new instance of a Parse for the given text and ParseData
-   * 
-   * @param text
-   *                the textual representation of the task
-   * @param parseData
-   */
-  public ParseImpl(String text, ParseData parseData) {
-    this.text = text;
-    this.parseData = parseData;
-  }
-
-  /*
-   * (non-Javadoc)
-   * 
-   * @see org.apache.droids.api.Parse#getData()
-   */
-  public ParseData getData() {
-    return parseData;
-  }
-
-  /*
-   * (non-Javadoc)
-   * 
-   * @see org.apache.droids.api.Parse#getText()
-   */
-  public String getText() {
-    return text;
-  }
 
 }

Modified: incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/HtmlParser.java?rev=762982&r1=762981&r2=762982&view=diff
==============================================================================
--- incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/HtmlParser.java (original)
+++ incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/HtmlParser.java Tue Apr  7 21:48:48 2009
@@ -20,7 +20,6 @@
 import java.util.HashMap;
 import java.util.Map;
 
-import org.apache.droids.ParseData;
 import org.apache.droids.api.ContentEntity;
 import org.apache.droids.api.Link;
 import org.apache.droids.api.Parse;
@@ -56,7 +55,7 @@
     this.elements = elements;
   }
 
-  public Parse getParse(ContentEntity entity, Link newLink) throws DroidsException, IOException {
+  public Parse parse(ContentEntity entity, Link newLink) throws DroidsException, IOException {
     // setup filter chain
     XMLDocumentFilter[] filters = { getRemover() };
     // create HTML parser
@@ -71,7 +70,7 @@
     } finally {
       instream.close();
     }
-    return new ParseImpl(newLink.getId(), new ParseData(linkExtractor.getLinks()));
+    return new ParseImpl(newLink.getId(),linkExtractor.getLinks());
   }
 
   private SAXParser getParser(XMLDocumentFilter[] filters) {

Modified: incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/robot/crawler/CrawlingWorker.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/robot/crawler/CrawlingWorker.java?rev=762982&r1=762981&r2=762982&view=diff
==============================================================================
--- incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/robot/crawler/CrawlingWorker.java (original)
+++ incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/robot/crawler/CrawlingWorker.java Tue Apr  7 21:48:48 2009
@@ -71,8 +71,8 @@
             }
           }
           else {
-            Parse parse = parser.getParse(entity, link);
-            if( parse.getData() != null ) {
+            Parse parse = parser.parse(entity, link);
+            if( parse.getOutlinks() != null ) {
               Collection<Link> outlinks = getFilteredOutlinks( parse );
               droid.getQueue().merge( outlinks );
             }
@@ -102,7 +102,7 @@
    
     // TODO -- make the hashvalue for Outlink...
     Map<String,Link> filtered = new LinkedHashMap<String,Link>();
-    for( Link outlink : parse.getData().getOutlinks() ) {
+    for( Link outlink : parse.getOutlinks() ) {
       String id = outlink.getId();
       if (filters.accept(outlink.getId()) && !filtered.containsKey(id)) {
         filtered.put(id,outlink);

Modified: incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java?rev=762982&r1=762981&r2=762982&view=diff
==============================================================================
--- incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java (original)
+++ incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java Tue Apr  7 21:48:48 2009
@@ -21,7 +21,6 @@
 import java.util.HashMap;
 import java.util.Map;
 
-import org.apache.droids.ParseData;
 import org.apache.droids.api.ContentEntity;
 import org.apache.droids.api.Link;
 import org.apache.droids.api.Parse;
@@ -55,7 +54,7 @@
     this.elements = elements;
   }
 
-  public Parse getParse(ContentEntity entity, Link link) throws IOException, DroidsException {
+  public Parse parse(ContentEntity entity, Link link) throws IOException, DroidsException {
     // Init Tika objects
     parser = new AutoDetectParser();
     metadata = new Metadata();
@@ -72,9 +71,8 @@
     InputStream instream = entity.obtainContent();
     try {
       parser.parse(instream, parallelHandler, metadata);
-      ParseData parseData = new ParseData(extractor.getLinks());
       
-      return new ParseImpl(data.toString(), parseData);
+      return new ParseImpl(data.toString(), extractor.getLinks());
     } catch (SAXException ex) {
       throw new DroidsException("Failure parsing document " + link.getId(), ex);
     } catch (TikaException ex) {