You are viewing a plain text version of this content. The canonical link for it is here.
Posted to droids-commits@incubator.apache.org by th...@apache.org on 2009/04/08 04:31:11 UTC
svn commit: r762982 - in /incubator/droids/trunk:
droids-core/src/main/java/org/apache/droids/
droids-core/src/main/java/org/apache/droids/api/
droids-core/src/main/java/org/apache/droids/parse/
droids-core/src/main/java/org/apache/droids/parse/html/ d...
Author: thorsten
Date: Tue Apr 7 21:48:48 2009
New Revision: 762982
URL: http://svn.apache.org/viewvc?rev=762982&view=rev
Log:
DROIDS-44 (DROIDS-11)
# reduce number of depth in the API
# Parse should support holding custom data
Patch submitted by Mingfai Ma. Thank you.
Also fixes TikaHtmlParser, which the original patch forgot to update.
Modified:
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/ParseData.java
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/Parse.java
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/Parser.java
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/ParseImpl.java
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/HtmlParser.java
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/robot/crawler/CrawlingWorker.java
incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java
Modified: incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/ParseData.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/ParseData.java?rev=762982&r1=762981&r2=762982&view=diff
==============================================================================
--- incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/ParseData.java (original)
+++ incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/ParseData.java Tue Apr 7 21:48:48 2009
@@ -1,56 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.droids;
-
-import java.util.Collection;
-import java.util.Collections;
-
-import org.apache.droids.api.Link;
-
-/**
- * The result object that are filled by a parser
- *
- * @version 1.0
- *
- */
-public class ParseData {
- private final Collection<Link> outlinks;
-
- /**
- * Create a new instance of Parse data for the given outlinks
- *
- * @param outlinks
- * the collection of outlinks (used directly)
- */
- public ParseData(Collection<Link> outlinks) {
- if( outlinks == null ) {
- this.outlinks = Collections.emptyList();
- }
- else {
- this.outlinks = outlinks;
- }
- }
-
- /**
- * Get the outlinks of the page.
- *
- * @return all outlinks
- */
- public Collection<Link> getOutlinks() {
- return outlinks;
- }
-}
Modified: incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/Parse.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/Parse.java?rev=762982&r1=762981&r2=762982&view=diff
==============================================================================
--- incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/Parse.java (original)
+++ incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/Parse.java Tue Apr 7 21:48:48 2009
@@ -16,29 +16,35 @@
*/
package org.apache.droids.api;
-import org.apache.droids.ParseData;
+
+import java.util.Collection;
/**
* Wrapper object that encapsulates the result of the parsing of the underlying
* document.
- *
+ *
* @version 1.0
- *
+ *
*/
public interface Parse {
/**
* The textual content of the page. This is indexed, searched, and used when
* generating snippets.
- *
+ *
* @return the textual representation of the underlying page.
*/
String getText();
/**
* Other data extracted from the page.
- *
+ *
* @return the processed parse data.
- * @see ParseData
*/
- ParseData getData();
+ Object getData();
+
+ /**
+ * Outlinks extracted from the content entity.
+ * @return the outlinks extracted from the content entity
+ */
+ Collection<Link> getOutlinks();
}
Modified: incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/Parser.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/Parser.java?rev=762982&r1=762981&r2=762982&view=diff
==============================================================================
--- incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/Parser.java (original)
+++ incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/Parser.java Tue Apr 7 21:48:48 2009
@@ -31,11 +31,11 @@
/**
* Creates the parse for some content.
*
- * @param openStream
+ * @param entity
* the content entity to be parsed
* @param link
* the link that corresponds to the entity
* @return the parse object
*/
- Parse getParse(ContentEntity entity, Link link) throws DroidsException, IOException;
+ Parse parse(ContentEntity entity, Link link) throws DroidsException, IOException;
}
Modified: incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/ParseImpl.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/ParseImpl.java?rev=762982&r1=762981&r2=762982&view=diff
==============================================================================
--- incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/ParseImpl.java (original)
+++ incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/ParseImpl.java Tue Apr 7 21:48:48 2009
@@ -16,49 +16,57 @@
*/
package org.apache.droids.parse;
-import org.apache.droids.ParseData;
import org.apache.droids.api.Parse;
+import org.apache.droids.api.Link;
+
+import java.util.Collection;
/**
* Default implementation of Parse
- *
+ *
* @version 1.0
- *
*/
public class ParseImpl implements Parse {
+ protected String text;
+ protected Object data;
+ protected Collection<Link> outlinks;
+
+ public ParseImpl() {}
+
+ public ParseImpl(String text, Collection<Link> outlinks) {
+ this.text = text;
+ this.outlinks = outlinks;
+ }
+
+ public ParseImpl(String text, Object data, Collection<Link> outlinks) {
+ this.text = text;
+ this.data = data;
+ this.outlinks = outlinks;
+ }
+
+ public String getText() {
+ return text;
+ }
+
+ public void setText(String text) {
+ this.text = text;
+ }
+
+ public Object getData() {
+ return data;
+ }
+
+ public void setData(Object data) {
+ this.data = data;
+ }
+
+ public Collection<Link> getOutlinks() {
+ return outlinks;
+ }
+
+ public void setOutlinks(Collection<Link> outlinks) {
+ this.outlinks = outlinks;
+ }
- private ParseData parseData;
-
- private String text;
-
- /**
- * Create a new instance of a Parse for the given text and ParseData
- *
- * @param text
- * the textual representation of the task
- * @param parseData
- */
- public ParseImpl(String text, ParseData parseData) {
- this.text = text;
- this.parseData = parseData;
- }
-
- /*
- * (non-Javadoc)
- *
- * @see org.apache.droids.api.Parse#getData()
- */
- public ParseData getData() {
- return parseData;
- }
-
- /*
- * (non-Javadoc)
- *
- * @see org.apache.droids.api.Parse#getText()
- */
- public String getText() {
- return text;
- }
}
Modified: incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/HtmlParser.java?rev=762982&r1=762981&r2=762982&view=diff
==============================================================================
--- incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/HtmlParser.java (original)
+++ incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/HtmlParser.java Tue Apr 7 21:48:48 2009
@@ -20,7 +20,6 @@
import java.util.HashMap;
import java.util.Map;
-import org.apache.droids.ParseData;
import org.apache.droids.api.ContentEntity;
import org.apache.droids.api.Link;
import org.apache.droids.api.Parse;
@@ -56,7 +55,7 @@
this.elements = elements;
}
- public Parse getParse(ContentEntity entity, Link newLink) throws DroidsException, IOException {
+ public Parse parse(ContentEntity entity, Link newLink) throws DroidsException, IOException {
// setup filter chain
XMLDocumentFilter[] filters = { getRemover() };
// create HTML parser
@@ -71,7 +70,7 @@
} finally {
instream.close();
}
- return new ParseImpl(newLink.getId(), new ParseData(linkExtractor.getLinks()));
+ return new ParseImpl(newLink.getId(),linkExtractor.getLinks());
}
private SAXParser getParser(XMLDocumentFilter[] filters) {
Modified: incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/robot/crawler/CrawlingWorker.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/robot/crawler/CrawlingWorker.java?rev=762982&r1=762981&r2=762982&view=diff
==============================================================================
--- incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/robot/crawler/CrawlingWorker.java (original)
+++ incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/robot/crawler/CrawlingWorker.java Tue Apr 7 21:48:48 2009
@@ -71,8 +71,8 @@
}
}
else {
- Parse parse = parser.getParse(entity, link);
- if( parse.getData() != null ) {
+ Parse parse = parser.parse(entity, link);
+ if( parse.getOutlinks() != null ) {
Collection<Link> outlinks = getFilteredOutlinks( parse );
droid.getQueue().merge( outlinks );
}
@@ -102,7 +102,7 @@
// TODO -- make the hashvalue for Outlink...
Map<String,Link> filtered = new LinkedHashMap<String,Link>();
- for( Link outlink : parse.getData().getOutlinks() ) {
+ for( Link outlink : parse.getOutlinks() ) {
String id = outlink.getId();
if (filters.accept(outlink.getId()) && !filtered.containsKey(id)) {
filtered.put(id,outlink);
Modified: incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java?rev=762982&r1=762981&r2=762982&view=diff
==============================================================================
--- incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java (original)
+++ incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java Tue Apr 7 21:48:48 2009
@@ -21,7 +21,6 @@
import java.util.HashMap;
import java.util.Map;
-import org.apache.droids.ParseData;
import org.apache.droids.api.ContentEntity;
import org.apache.droids.api.Link;
import org.apache.droids.api.Parse;
@@ -55,7 +54,7 @@
this.elements = elements;
}
- public Parse getParse(ContentEntity entity, Link link) throws IOException, DroidsException {
+ public Parse parse(ContentEntity entity, Link link) throws IOException, DroidsException {
// Init Tika objects
parser = new AutoDetectParser();
metadata = new Metadata();
@@ -72,9 +71,8 @@
InputStream instream = entity.obtainContent();
try {
parser.parse(instream, parallelHandler, metadata);
- ParseData parseData = new ParseData(extractor.getLinks());
- return new ParseImpl(data.toString(), parseData);
+ return new ParseImpl(data.toString(), extractor.getLinks());
} catch (SAXException ex) {
throw new DroidsException("Failure parsing document " + link.getId(), ex);
} catch (TikaException ex) {