You are viewing a plain text version of this content. The canonical link for it is here.
Posted to droids-commits@incubator.apache.org by to...@apache.org on 2012/12/18 08:48:00 UTC
svn commit: r1423339 [5/5] - in /incubator/droids/branches/0.2.x-cleanup:
droids-crawler/ droids-crawler/src/main/java/org/apache/droids/crawler/
droids-crawler/src/main/java/org/apache/droids/protocol/http/
droids-crawler/src/test/java/org/apache/droi...
Modified: incubator/droids/branches/0.2.x-cleanup/droids-spring/src/test/java/org/apache/droids/dynamic/TestSimpleDroid.java
URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-spring/src/test/java/org/apache/droids/dynamic/TestSimpleDroid.java?rev=1423339&r1=1423338&r2=1423339&view=diff
==============================================================================
--- incubator/droids/branches/0.2.x-cleanup/droids-spring/src/test/java/org/apache/droids/dynamic/TestSimpleDroid.java (original)
+++ incubator/droids/branches/0.2.x-cleanup/droids-spring/src/test/java/org/apache/droids/dynamic/TestSimpleDroid.java Tue Dec 18 08:47:39 2012
@@ -36,63 +36,63 @@ import org.springframework.context.suppo
public class TestSimpleDroid {
- protected LocalHttpServer testserver;
+ protected LocalHttpServer testserver;
- private final static ApplicationContext context = new ClassPathXmlApplicationContext(
- "classpath:/droids-core-test-context.xml");
+ private final static ApplicationContext context = new ClassPathXmlApplicationContext(
+ "classpath:/droids-core-test-context.xml");
- private DroidsConfig droidsConfig = null;
+ private DroidsConfig droidsConfig = null;
- @Before
- public void setUp() throws Exception {
- this.droidsConfig = (DroidsConfig) TestSimpleDroid.context
- .getBean("org.apache.droids.dynamic.DroidsConfig");
- this.testserver = new LocalHttpServer();
- }
-
- @Test
- public void testReportCrawlingDroid() throws Exception {
- this.testserver.register("*", new ResourceHandler());
- this.testserver.start();
-
- String baseURI = "http:/" + this.testserver.getServiceAddress();
- String targetURI = baseURI + "/start_html";
-
- Droid<Link> droid = createSimpleReportCrawlingDroid(targetURI);
-
- droid.init();
- droid.start();
- droid.getTaskMaster().awaitTermination(30, TimeUnit.SECONDS);
-
- Assert.assertFalse(ReportHandler.getReport().isEmpty());
- Assert.assertEquals(5, ReportHandler.getReport().size());
- Assert.assertTrue(ReportHandler.getReport().contains(
- baseURI + "/start_html"));
- Assert.assertTrue(ReportHandler.getReport().contains(
- baseURI + "/page1_html"));
- Assert.assertTrue(ReportHandler.getReport().contains(
- baseURI + "/page2_html"));
- Assert.assertTrue(ReportHandler.getReport().contains(
- baseURI + "/page3_html"));
- Assert.assertTrue(ReportHandler.getReport().contains(
- baseURI + "/page4_html"));
-
- ReportHandler.recycle();
- }
-
- private Droid<Link> createSimpleReportCrawlingDroid(final String targetURI) {
- Droid<Link> droid = this.droidsConfig.getDroid("report");
-
- Assert.assertFalse("Droid is null.", droid == null);
- Assert.assertTrue(
- "The test droid must be an instance of ReportCrawlingDroid",
- droid instanceof ReportCrawlingDroid);
-
- final List<String> locations = new ArrayList<String>();
- locations.add(targetURI);
- ((CrawlingDroid) droid).setInitialLocations(locations);
+ @Before
+ public void setUp() throws Exception {
+ this.droidsConfig = (DroidsConfig) TestSimpleDroid.context
+ .getBean("org.apache.droids.dynamic.DroidsConfig");
+ this.testserver = new LocalHttpServer();
+ }
+
+ @Test
+ public void testReportCrawlingDroid() throws Exception {
+ this.testserver.register("*", new ResourceHandler());
+ this.testserver.start();
+
+ String baseURI = "http:/" + this.testserver.getServiceAddress();
+ String targetURI = baseURI + "/start_html";
+
+ Droid<Link> droid = createSimpleReportCrawlingDroid(targetURI);
+
+ droid.init();
+ droid.start();
+ droid.getTaskMaster().awaitTermination(30, TimeUnit.SECONDS);
+
+ Assert.assertFalse(ReportHandler.getReport().isEmpty());
+ Assert.assertEquals(5, ReportHandler.getReport().size());
+ Assert.assertTrue(ReportHandler.getReport().contains(
+ baseURI + "/start_html"));
+ Assert.assertTrue(ReportHandler.getReport().contains(
+ baseURI + "/page1_html"));
+ Assert.assertTrue(ReportHandler.getReport().contains(
+ baseURI + "/page2_html"));
+ Assert.assertTrue(ReportHandler.getReport().contains(
+ baseURI + "/page3_html"));
+ Assert.assertTrue(ReportHandler.getReport().contains(
+ baseURI + "/page4_html"));
+
+ ReportHandler.recycle();
+ }
+
+ private Droid<Link> createSimpleReportCrawlingDroid(final String targetURI) {
+ Droid<Link> droid = this.droidsConfig.getDroid("report");
+
+ Assert.assertFalse("Droid is null.", droid == null);
+ Assert.assertTrue(
+ "The test droid must be an instance of ReportCrawlingDroid",
+ droid instanceof ReportCrawlingDroid);
+
+ final List<String> locations = new ArrayList<String>();
+ locations.add(targetURI);
+ ((CrawlingDroid) droid).setInitialLocations(locations);
- return droid;
- }
+ return droid;
+ }
}
Modified: incubator/droids/branches/0.2.x-cleanup/droids-spring/src/test/resources/droids-core-test-context.xml
URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-spring/src/test/resources/droids-core-test-context.xml?rev=1423339&r1=1423338&r2=1423339&view=diff
==============================================================================
--- incubator/droids/branches/0.2.x-cleanup/droids-spring/src/test/resources/droids-core-test-context.xml (original)
+++ incubator/droids/branches/0.2.x-cleanup/droids-spring/src/test/resources/droids-core-test-context.xml Tue Dec 18 08:47:39 2012
@@ -15,105 +15,105 @@
See the License for the specific language governing permissions and
limitations under the License.
-->
- <!--
- Using your own context
- +++++++++++++++++++++++++
- The easiest way is to
- a) create a droids-your-context.xml
- b) add:
- <import resource="classpath:/org/apache/droids/dynamic/droids-core-context.xml"/>
- c) implement your own beans which will override the imported ones
- d) Call the ant target like:
- ant droids.crawl default -Ddroids.spring.context=PATH/droids-your-context.xml
- -->
+<!--
+ Using your own context
+ +++++++++++++++++++++++++
+ The easiest way is to
+ a) create a droids-your-context.xml
+ b) add:
+ <import resource="classpath:/org/apache/droids/dynamic/droids-core-context.xml"/>
+ c) implement your own beans which will override the imported ones
+ d) Call the ant target like:
+ ant droids.crawl default -Ddroids.spring.context=PATH/droids-your-context.xml
+-->
<beans xmlns="http://www.springframework.org/schema/beans"
- xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
- xmlns:configurator="http://cocoon.apache.org/schema/configurator"
- xsi:schemaLocation="http://www.springframework.org/schema/beans http://www.springframework.org/schema/beans/spring-beans-2.5.xsd
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xmlns:configurator="http://cocoon.apache.org/schema/configurator"
+ xsi:schemaLocation="http://www.springframework.org/schema/beans http://www.springframework.org/schema/beans/spring-beans-2.5.xsd
http://cocoon.apache.org/schema/configurator http://cocoon.apache.org/schema/configurator/cocoon-configurator-1.1.0.xsd">
-
- <import resource="classpath:/org/apache/droids/dynamic/droids-core-context.xml"/>
-
- <!-- configuration properties file -->
- <bean class="org.springframework.beans.factory.config.PropertyPlaceholderConfigurer">
- <property name="locations" value="classpath:/droids-core.properties"/>
- </bean>
-
- <bean name="taskExceptionHandler"
- class="org.apache.droids.impl.DefaultTaskExceptionHandler">
- </bean>
-
- <bean name="taskMaster"
- class="org.apache.droids.impl.MultiThreadedTaskMaster">
- <property name="exceptionHandler" ref="taskExceptionHandler" />
- <property name="delayTimer" ref="org.apache.droids.delay.SimpleDelayTimer"/>
- <!--<property name="maxThreads" value="${droids.maxThreads}"/>-->
- </bean>
-
- <!-- Droids -->
- <bean name="org.apache.droids.api.Droid/report"
- class="org.apache.droids.robot.crawler.ReportCrawlingDroid">
- <constructor-arg ref="java.util.LinkedList" />
- <constructor-arg ref="taskMaster" />
-
-
- <property name="protocolFactory" ref="org.apache.droids.helper.factories.ProtocolFactory"/>
- <property name="parserFactory" ref="org.apache.droids.helper.factories.ParserFactory"/>
- <property name="filtersFactory" ref="org.apache.droids.helper.factories.FilterFactory"/>
- </bean>
- <!-- Queue -->
- <bean id="java.util.LinkedList"
- class="java.util.LinkedList">
- </bean>
- <!-- Protocol -->
- <bean
- name="org.apache.droids.api.Protocol/http"
- class="org.apache.droids.protocol.http.HttpProtocol" scope="singleton">
- <property name="userAgent" value="DROIDS-crawler-x-m01y08"/>
- <property name="forceAllow" value="${droids.protocol.http.force}"/>
- </bean>
- <bean name="org.apache.droids.api.Protocol/file"
- class="org.apache.droids.protocol.file.FileProtocol" scope="singleton"/>
- <!-- Parser -->
- <bean
- name="text/html"
- class="org.apache.droids.parse.html.HtmlParser">
- <property name="elements">
- <map>
- <entry key="a" value="href"/>
- <entry key="link" value="href"/>
- <entry key="img" value="src"/>
- <entry key="script" value="src"/>
- </map>
- </property>
- </bean>
- <!-- Filter -->
- <bean
- name="org.apache.droids.api.URLFilter/org.apache.droids.net.RegexURLFilter"
- class="org.apache.droids.net.RegexURLFilter">
- <property name="file" value="${droids.filter.regex}"/>
- </bean>
- <!-- Handler -->
- <bean
- name="org.apache.droids.api.Handler/org.apache.droids.handle.SysoutHandler"
- class="org.apache.droids.handle.SysoutHandler"/>
- <bean
- name="org.apache.droids.api.Handler/org.apache.droids.handle.SaveHandler"
- class="org.apache.droids.handle.SaveHandler">
- <property name="saveContentHandlerStrategy"
- ref="org.apache.droids.api.Handler/org.apache.droids.handle.DefaultSaveContentHandlerStrategy" />
- </bean>
- <bean
- name="org.apache.droids.api.Handler/org.apache.droids.handle.DefaultSaveContentHandlerStrategy"
- class="org.apache.droids.handle.DefaultSaveContentHandlerStrategy">
- <property name="includeHost" value="true" />
- <property name="outputDir" value="tmp/" />
- </bean>
-
-
- <bean
- name="org.apache.droids.delay.SimpleDelayTimer"
- class="org.apache.droids.delay.SimpleDelayTimer">
- <property name="delayMillis" value="${droids.delay.request}"/>
- </bean>
+
+ <import resource="classpath:/org/apache/droids/dynamic/droids-core-context.xml"/>
+
+ <!-- configuration properties file -->
+ <bean class="org.springframework.beans.factory.config.PropertyPlaceholderConfigurer">
+ <property name="locations" value="classpath:/droids-core.properties"/>
+ </bean>
+
+ <bean name="taskExceptionHandler"
+ class="org.apache.droids.impl.DefaultTaskExceptionHandler">
+ </bean>
+
+ <bean name="taskMaster"
+ class="org.apache.droids.impl.MultiThreadedTaskMaster">
+ <property name="exceptionHandler" ref="taskExceptionHandler"/>
+ <property name="delayTimer" ref="org.apache.droids.delay.SimpleDelayTimer"/>
+ <!--<property name="maxThreads" value="${droids.maxThreads}"/>-->
+ </bean>
+
+ <!-- Droids -->
+ <bean name="org.apache.droids.api.Droid/report"
+ class="org.apache.droids.robot.crawler.ReportCrawlingDroid">
+ <constructor-arg ref="java.util.LinkedList"/>
+ <constructor-arg ref="taskMaster"/>
+
+
+ <property name="protocolFactory" ref="org.apache.droids.helper.factories.ProtocolFactory"/>
+ <property name="parserFactory" ref="org.apache.droids.helper.factories.ParserFactory"/>
+ <property name="filtersFactory" ref="org.apache.droids.helper.factories.FilterFactory"/>
+ </bean>
+ <!-- Queue -->
+ <bean id="java.util.LinkedList"
+ class="java.util.LinkedList">
+ </bean>
+ <!-- Protocol -->
+ <bean
+ name="org.apache.droids.api.Protocol/http"
+ class="org.apache.droids.protocol.http.HttpProtocol" scope="singleton">
+ <property name="userAgent" value="DROIDS-crawler-x-m01y08"/>
+ <property name="forceAllow" value="${droids.protocol.http.force}"/>
+ </bean>
+ <bean name="org.apache.droids.api.Protocol/file"
+ class="org.apache.droids.protocol.file.FileProtocol" scope="singleton"/>
+ <!-- Parser -->
+ <bean
+ name="text/html"
+ class="org.apache.droids.parse.html.HtmlParser">
+ <property name="elements">
+ <map>
+ <entry key="a" value="href"/>
+ <entry key="link" value="href"/>
+ <entry key="img" value="src"/>
+ <entry key="script" value="src"/>
+ </map>
+ </property>
+ </bean>
+ <!-- Filter -->
+ <bean
+ name="org.apache.droids.api.URLFilter/org.apache.droids.net.RegexURLFilter"
+ class="org.apache.droids.net.RegexURLFilter">
+ <property name="file" value="${droids.filter.regex}"/>
+ </bean>
+ <!-- Handler -->
+ <bean
+ name="org.apache.droids.api.Handler/org.apache.droids.handle.SysoutHandler"
+ class="org.apache.droids.handle.SysoutHandler"/>
+ <bean
+ name="org.apache.droids.api.Handler/org.apache.droids.handle.SaveHandler"
+ class="org.apache.droids.handle.SaveHandler">
+ <property name="saveContentHandlerStrategy"
+ ref="org.apache.droids.api.Handler/org.apache.droids.handle.DefaultSaveContentHandlerStrategy"/>
+ </bean>
+ <bean
+ name="org.apache.droids.api.Handler/org.apache.droids.handle.DefaultSaveContentHandlerStrategy"
+ class="org.apache.droids.handle.DefaultSaveContentHandlerStrategy">
+ <property name="includeHost" value="true"/>
+ <property name="outputDir" value="tmp/"/>
+ </bean>
+
+
+ <bean
+ name="org.apache.droids.delay.SimpleDelayTimer"
+ class="org.apache.droids.delay.SimpleDelayTimer">
+ <property name="delayMillis" value="${droids.delay.request}"/>
+ </bean>
</beans>
Modified: incubator/droids/branches/0.2.x-cleanup/droids-tika/pom.xml
URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-tika/pom.xml?rev=1423339&r1=1423338&r2=1423339&view=diff
==============================================================================
--- incubator/droids/branches/0.2.x-cleanup/droids-tika/pom.xml (original)
+++ incubator/droids/branches/0.2.x-cleanup/droids-tika/pom.xml Tue Dec 18 08:47:39 2012
@@ -15,53 +15,54 @@
See the License for the specific language governing permissions and
limitations under the License.
-->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
- <modelVersion>4.0.0</modelVersion>
- <parent>
- <artifactId>droids</artifactId>
- <groupId>org.apache.droids</groupId>
- <version>0.3.0-incubating-SNAPSHOT</version>
- <relativePath>../pom.xml</relativePath>
- </parent>
- <artifactId>droids-tika</artifactId>
- <name>Apache Droids Tika</name>
- <inceptionYear>2007</inceptionYear>
- <description>Apache Droids Tika Parser</description>
- <packaging>jar</packaging>
+ <modelVersion>4.0.0</modelVersion>
+ <parent>
+ <artifactId>droids</artifactId>
+ <groupId>org.apache.droids</groupId>
+ <version>0.3.0-incubating-SNAPSHOT</version>
+ <relativePath>../pom.xml</relativePath>
+ </parent>
+ <artifactId>droids-tika</artifactId>
+ <name>Apache Droids Tika</name>
+ <inceptionYear>2007</inceptionYear>
+ <description>Apache Droids Tika Parser</description>
+ <packaging>jar</packaging>
- <properties>
- <tika-release-version>1.1</tika-release-version>
- </properties>
+ <properties>
+ <tika-release-version>1.1</tika-release-version>
+ </properties>
- <dependencies>
- <dependency>
- <groupId>org.apache.droids</groupId>
- <artifactId>droids-core</artifactId>
- <version>${project.version}</version>
- </dependency>
- <dependency>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-core</artifactId>
- <version>${tika-release-version}</version>
- </dependency>
- <dependency>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-parsers</artifactId>
- <version>${tika-release-version}</version>
- <exclusions>
- <exclusion>
- <artifactId>commons-logging</artifactId>
- <groupId>commons-logging</groupId>
- </exclusion>
- </exclusions>
- </dependency>
- <!-- test dependencies -->
- <dependency>
- <groupId>junit</groupId>
- <artifactId>junit</artifactId>
- <version>${junit.version}</version>
- <scope>test</scope>
- </dependency>
- </dependencies>
+ <dependencies>
+ <dependency>
+ <groupId>org.apache.droids</groupId>
+ <artifactId>droids-core</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>${tika-release-version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parsers</artifactId>
+ <version>${tika-release-version}</version>
+ <exclusions>
+ <exclusion>
+ <artifactId>commons-logging</artifactId>
+ <groupId>commons-logging</groupId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+ <!-- test dependencies -->
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <version>${junit.version}</version>
+ <scope>test</scope>
+ </dependency>
+ </dependencies>
</project>
Modified: incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/TikaDocumentParser.java
URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/TikaDocumentParser.java?rev=1423339&r1=1423338&r2=1423339&view=diff
==============================================================================
--- incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/TikaDocumentParser.java (original)
+++ incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/TikaDocumentParser.java Tue Dec 18 08:47:39 2012
@@ -54,76 +54,75 @@ import org.xml.sax.SAXException;
/**
* Parses documents using Tika.
* Any document type that Tika can handle, can be handled by this class,
- * including HTML.
- *
+ * including HTML.
*/
public class TikaDocumentParser implements TikaParser {
- protected static final Logger LOG = LoggerFactory.getLogger(TikaDocumentParser.class);
-
- @Override
- public TikaParse parse(ContentEntity entity, Task task) throws DroidsException,
- IOException {
- // Init Tika objects
- org.apache.tika.parser.Parser parser = new AutoDetectParser();
- Metadata metadata = new Metadata();
-
- String charset = entity.getCharset();
- if (charset == null) {
- charset = "UTF-8";
- }
-
- StringWriter dataBuffer = new StringWriter();
- StringWriter bodyBuffer = new StringWriter();
- StringWriter mainContentBuffer = new StringWriter();
-
- SAXTransformerFactory factory = (SAXTransformerFactory) SAXTransformerFactory.newInstance();
- TransformerHandler xmlHandler;
- try {
- xmlHandler = factory.newTransformerHandler();
- } catch (TransformerConfigurationException e) {
- throw new DroidsException(e);
- }
- xmlHandler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
- xmlHandler.setResult(new StreamResult(dataBuffer));
-
- BoilerpipeContentHandler mainContentHandler = new BoilerpipeContentHandler(mainContentBuffer);
- BodyContentHandler bodyHandler = new BodyContentHandler(bodyBuffer);
- LinkContentHandler linkHandler = new LinkContentHandler();
-
- TeeContentHandler parallelHandler = new TeeContentHandler(xmlHandler, mainContentHandler, bodyHandler, linkHandler );
-
- InputStream instream = entity.obtainContent();
- try {
- parser.parse(instream, parallelHandler, metadata, new ParseContext());
-
- ArrayList<Link> extractedTasks = new ArrayList<Link>();
- int depth = task.getDepth() + 1;
- if (task instanceof LinkTask) {
- for(org.apache.tika.sax.Link tikaLink : linkHandler.getLinks()) {
- try {
- URI uri = new URI(tikaLink.getUri());
- // Test to see if the scheme is empty
- // This would indicate a relative URL, so resolve it against the task URI
- if(uri.getScheme() == null) {
- uri = ((Link) task).getURI().resolve(uri);
+ protected static final Logger LOG = LoggerFactory.getLogger(TikaDocumentParser.class);
+
+ @Override
+ public TikaParse parse(ContentEntity entity, Task task) throws DroidsException,
+ IOException {
+ // Init Tika objects
+ org.apache.tika.parser.Parser parser = new AutoDetectParser();
+ Metadata metadata = new Metadata();
+
+ String charset = entity.getCharset();
+ if (charset == null) {
+ charset = "UTF-8";
+ }
+
+ StringWriter dataBuffer = new StringWriter();
+ StringWriter bodyBuffer = new StringWriter();
+ StringWriter mainContentBuffer = new StringWriter();
+
+ SAXTransformerFactory factory = (SAXTransformerFactory) SAXTransformerFactory.newInstance();
+ TransformerHandler xmlHandler;
+ try {
+ xmlHandler = factory.newTransformerHandler();
+ } catch (TransformerConfigurationException e) {
+ throw new DroidsException(e);
+ }
+ xmlHandler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
+ xmlHandler.setResult(new StreamResult(dataBuffer));
+
+ BoilerpipeContentHandler mainContentHandler = new BoilerpipeContentHandler(mainContentBuffer);
+ BodyContentHandler bodyHandler = new BodyContentHandler(bodyBuffer);
+ LinkContentHandler linkHandler = new LinkContentHandler();
+
+ TeeContentHandler parallelHandler = new TeeContentHandler(xmlHandler, mainContentHandler, bodyHandler, linkHandler);
+
+ InputStream instream = entity.obtainContent();
+ try {
+ parser.parse(instream, parallelHandler, metadata, new ParseContext());
+
+ ArrayList<Link> extractedTasks = new ArrayList<Link>();
+ int depth = task.getDepth() + 1;
+ if (task instanceof LinkTask) {
+ for (org.apache.tika.sax.Link tikaLink : linkHandler.getLinks()) {
+ try {
+ URI uri = new URI(tikaLink.getUri());
+ // Test to see if the scheme is empty
+ // This would indicate a relative URL, so resolve it against the task URI
+ if (uri.getScheme() == null) {
+ uri = ((Link) task).getURI().resolve(uri);
+ }
+ extractedTasks.add(new LinkTask((Link) task, uri, depth, tikaLink.getText()));
+ } catch (URISyntaxException e) {
+ if (LOG.isWarnEnabled()) {
+ LOG.warn("URI not valid: " + tikaLink.getUri());
+ }
+ }
+ }
}
- extractedTasks.add(new LinkTask((Link)task, uri, depth, tikaLink.getText()));
- } catch (URISyntaxException e) {
- if(LOG.isWarnEnabled()) {
- LOG.warn("URI not valid: "+ tikaLink.getUri());
- }
- }
- }
- }
- return new TikaParseImpl(dataBuffer.toString(), extractedTasks, bodyBuffer.toString(), mainContentBuffer.toString(), metadata);
- } catch (SAXException ex) {
- throw new DroidsException("Failure parsing document " + task.getId(), ex);
- } catch (TikaException ex) {
- throw new DroidsException("Failure parsing document " + task.getId(), ex);
- } finally {
- instream.close();
- }
- }
+ return new TikaParseImpl(dataBuffer.toString(), extractedTasks, bodyBuffer.toString(), mainContentBuffer.toString(), metadata);
+ } catch (SAXException ex) {
+ throw new DroidsException("Failure parsing document " + task.getId(), ex);
+ } catch (TikaException ex) {
+ throw new DroidsException("Failure parsing document " + task.getId(), ex);
+ } finally {
+ instream.close();
+ }
+ }
}
Modified: incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java
URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java?rev=1423339&r1=1423338&r2=1423339&view=diff
==============================================================================
--- incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java (original)
+++ incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java Tue Dec 18 08:47:39 2012
@@ -50,76 +50,74 @@ import org.apache.tika.sax.TeeContentHan
import org.xml.sax.SAXException;
/**
- *
* @deprecated Use TikaDocumentParser instead as it handles HTML just fine and performs the same operations.
- *
*/
@Deprecated
public class TikaHtmlParser implements TikaParser {
- protected static final Log log = LogFactory.getLog(TikaHtmlParser.class);
+ protected static final Log log = LogFactory.getLog(TikaHtmlParser.class);
- @Override
- public TikaParse parse(ContentEntity entity, Task task) throws IOException, DroidsException {
- // Init Tika objects
- org.apache.tika.parser.Parser parser = new AutoDetectParser();
- Metadata metadata = new Metadata();
-
- String charset = entity.getCharset();
- if (charset == null) {
- charset = "UTF-8";
- }
-
- StringWriter dataBuffer = new StringWriter();
- StringWriter bodyBuffer = new StringWriter();
- StringWriter mainContentBuffer = new StringWriter();
-
- SAXTransformerFactory factory = (SAXTransformerFactory) SAXTransformerFactory.newInstance();
- TransformerHandler xmlHandler;
- try {
- xmlHandler = factory.newTransformerHandler();
- } catch (TransformerConfigurationException e) {
- throw new DroidsException(e);
+ @Override
+ public TikaParse parse(ContentEntity entity, Task task) throws IOException, DroidsException {
+ // Init Tika objects
+ org.apache.tika.parser.Parser parser = new AutoDetectParser();
+ Metadata metadata = new Metadata();
+
+ String charset = entity.getCharset();
+ if (charset == null) {
+ charset = "UTF-8";
+ }
+
+ StringWriter dataBuffer = new StringWriter();
+ StringWriter bodyBuffer = new StringWriter();
+ StringWriter mainContentBuffer = new StringWriter();
+
+ SAXTransformerFactory factory = (SAXTransformerFactory) SAXTransformerFactory.newInstance();
+ TransformerHandler xmlHandler;
+ try {
+ xmlHandler = factory.newTransformerHandler();
+ } catch (TransformerConfigurationException e) {
+ throw new DroidsException(e);
+ }
+ xmlHandler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
+ xmlHandler.setResult(new StreamResult(dataBuffer));
+
+ BoilerpipeContentHandler mainContentHandler = new BoilerpipeContentHandler(mainContentBuffer);
+ BodyContentHandler bodyHandler = new BodyContentHandler(bodyBuffer);
+ LinkContentHandler linkHandler = new LinkContentHandler();
+
+ TeeContentHandler parallelHandler = new TeeContentHandler(xmlHandler, mainContentHandler, bodyHandler, linkHandler);
+
+ InputStream instream = entity.obtainContent();
+ try {
+ parser.parse(instream, parallelHandler, metadata, new ParseContext());
+
+ ArrayList<Link> extractedTasks = new ArrayList<Link>();
+ if (task instanceof Link) {
+ int depth = task.getDepth() + 1;
+ for (org.apache.tika.sax.Link tikaLink : linkHandler.getLinks()) {
+ try {
+ URI uri = new URI(tikaLink.getUri());
+ // Test to see if the scheme is empty
+ // This would indicate a relative URL, so resolve it against the task URI
+ if (uri.getScheme() == null) {
+ uri = ((Link) task).getURI().resolve(uri);
+ }
+ extractedTasks.add(new LinkTask((Link) task, uri, depth, tikaLink.getText()));
+ } catch (URISyntaxException e) {
+ if (log.isWarnEnabled()) {
+ log.warn("URI not valid: " + tikaLink.getUri());
+ }
+ }
+ }
+ }
+ return new TikaParseImpl(dataBuffer.toString(), extractedTasks, bodyBuffer.toString(), mainContentBuffer.toString(), metadata);
+ } catch (SAXException ex) {
+ throw new DroidsException("Failure parsing document " + task.getId(), ex);
+ } catch (TikaException ex) {
+ throw new DroidsException("Failure parsing document " + task.getId(), ex);
+ } finally {
+ instream.close();
+ }
}
- xmlHandler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
- xmlHandler.setResult(new StreamResult(dataBuffer));
-
- BoilerpipeContentHandler mainContentHandler = new BoilerpipeContentHandler(mainContentBuffer);
- BodyContentHandler bodyHandler = new BodyContentHandler(bodyBuffer);
- LinkContentHandler linkHandler = new LinkContentHandler();
-
- TeeContentHandler parallelHandler = new TeeContentHandler(xmlHandler, mainContentHandler, bodyHandler, linkHandler );
-
- InputStream instream = entity.obtainContent();
- try {
- parser.parse(instream, parallelHandler, metadata, new ParseContext());
-
- ArrayList<Link> extractedTasks = new ArrayList<Link>();
- if (task instanceof Link) {
- int depth = task.getDepth() + 1;
- for(org.apache.tika.sax.Link tikaLink : linkHandler.getLinks()) {
- try {
- URI uri = new URI(tikaLink.getUri());
- // Test to see if the scheme is empty
- // This would indicate a relative URL, so resolve it against the task URI
- if(uri.getScheme() == null) {
- uri = ((Link) task).getURI().resolve(uri);
- }
- extractedTasks.add(new LinkTask((Link)task, uri, depth, tikaLink.getText()));
- } catch (URISyntaxException e) {
- if(log.isWarnEnabled()) {
- log.warn("URI not valid: "+ tikaLink.getUri());
- }
- }
- }
- }
- return new TikaParseImpl(dataBuffer.toString(), extractedTasks, bodyBuffer.toString(), mainContentBuffer.toString(), metadata);
- } catch (SAXException ex) {
- throw new DroidsException("Failure parsing document " + task.getId(), ex);
- } catch (TikaException ex) {
- throw new DroidsException("Failure parsing document " + task.getId(), ex);
- } finally {
- instream.close();
- }
- }
}
Modified: incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/api/TikaParse.java
URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/api/TikaParse.java?rev=1423339&r1=1423338&r2=1423339&view=diff
==============================================================================
--- incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/api/TikaParse.java (original)
+++ incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/api/TikaParse.java Tue Dec 18 08:47:39 2012
@@ -21,36 +21,41 @@ import org.apache.tika.metadata.Metadata
public interface TikaParse extends Parse {
- /**
- * Retrieves the main content of the parsed document.
- * Uses Tika's plugin in for Boilerpipe.
- * @return plain text result with boilerplate removed
- */
- public String getMainContent();
-
- /**
- * Extracted meta data from the document. This can include
- * meta tags from within an HTML document
- * @return metadata object from the parse
- */
- public Metadata getMetadata();
-
- /**
- * The HTML representation of the document.
- * @return The HTML representation of the document.
- */
- public String getXml();
-
- /**
- * Plain text representation of the document.
- * @return plain text version without formatting
- */
- public String getPlainText();
-
- /**
- * If the document should be indexed or not.
- * This can be determined from metadata or other methods
- * @return false if the document shouldn't be indexed, true otherwise
- */
- public boolean isIndexed();
+ /**
+ * Retrieves the main content of the parsed document.
+ * Uses Tika's plugin for Boilerpipe.
+ *
+ * @return plain text result with boilerplate removed
+ */
+ public String getMainContent();
+
+ /**
+ * Metadata extracted from the document. This can include
+ * meta tags from within an HTML document.
+ *
+ * @return metadata object from the parse
+ */
+ public Metadata getMetadata();
+
+ /**
+ * The XML representation of the document, as serialized by the parser.
+ *
+ * @return the XML representation of the document
+ */
+ public String getXml();
+
+ /**
+ * Plain text representation of the document.
+ *
+ * @return plain text version without formatting
+ */
+ public String getPlainText();
+
+ /**
+ * Whether the document should be indexed or not.
+ * This can be determined from metadata or by other means.
+ *
+ * @return false if the document shouldn't be indexed, true otherwise
+ */
+ public boolean isIndexed();
}
Modified: incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/parse/TikaParseImpl.java
URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/parse/TikaParseImpl.java?rev=1423339&r1=1423338&r2=1423339&view=diff
==============================================================================
--- incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/parse/TikaParseImpl.java (original)
+++ incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/parse/TikaParseImpl.java Tue Dec 18 08:47:39 2012
@@ -26,60 +26,60 @@ import org.apache.tika.metadata.Metadata
public class TikaParseImpl extends ParseImpl implements TikaParse {
- private String plainText;
- private String mainContent;
- private Metadata metadata;
-
- public TikaParseImpl(String text, Collection<Link> outlinks) {
- super(text,outlinks);
- }
-
- public TikaParseImpl(String text, Object data, Collection<Link> outlinks) {
- super(text,data,outlinks);
- }
-
- public TikaParseImpl(String xmlContent, ArrayList<Link> extractedTasks,
- String plainText, String mainContent, Metadata metadata) {
- this(xmlContent, extractedTasks);
- this.plainText = plainText;
- this.mainContent = mainContent;
- this.metadata = metadata;
- }
-
- @Override
- public String getMainContent() {
- return mainContent;
- }
-
- @Override
- public Metadata getMetadata() {
- return metadata;
- }
-
- @Override
- public String getXml() {
- return super.text;
- }
-
- @Override
- public String getPlainText() {
- return plainText;
- }
-
- @Override
- public boolean isFollowed() {
- if(metadata.get("robots") != null && metadata.get("robots").toLowerCase().contains("nofollow")) {
- return false;
- }
- return true;
- }
-
- @Override
- public boolean isIndexed() {
- if(metadata.get("robots") != null && metadata.get("robots").toLowerCase().contains("noindex")) {
- return false;
+ private String plainText;
+ private String mainContent;
+ private Metadata metadata;
+
+ public TikaParseImpl(String text, Collection<Link> outlinks) {
+ super(text, outlinks);
+ }
+
+ public TikaParseImpl(String text, Object data, Collection<Link> outlinks) {
+ super(text, data, outlinks);
+ }
+
+ public TikaParseImpl(String xmlContent, ArrayList<Link> extractedTasks,
+ String plainText, String mainContent, Metadata metadata) {
+ this(xmlContent, extractedTasks);
+ this.plainText = plainText;
+ this.mainContent = mainContent;
+ this.metadata = metadata;
+ }
+
+ @Override
+ public String getMainContent() {
+ return mainContent;
+ }
+
+ @Override
+ public Metadata getMetadata() {
+ return metadata;
+ }
+
+ @Override
+ public String getXml() {
+ return super.text;
+ }
+
+ @Override
+ public String getPlainText() {
+ return plainText;
+ }
+
+ @Override
+ public boolean isFollowed() {
+ if (metadata.get("robots") != null && metadata.get("robots").toLowerCase().contains("nofollow")) {
+ return false;
+ }
+ return true;
+ }
+
+ @Override
+ public boolean isIndexed() {
+ if (metadata.get("robots") != null && metadata.get("robots").toLowerCase().contains("noindex")) {
+ return false;
+ }
+ return true;
}
- return true;
- }
}
Modified: incubator/droids/branches/0.2.x-cleanup/droids-tika/src/site/site.xml
URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-tika/src/site/site.xml?rev=1423339&r1=1423338&r2=1423339&view=diff
==============================================================================
--- incubator/droids/branches/0.2.x-cleanup/droids-tika/src/site/site.xml (original)
+++ incubator/droids/branches/0.2.x-cleanup/droids-tika/src/site/site.xml Tue Dec 18 08:47:39 2012
@@ -16,14 +16,14 @@
limitations under the License.
-->
<project xmlns="http://maven.apache.org/DECORATION/1.0.0"
- xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
- xsi:schemaLocation="http://maven.apache.org/DECORATION/1.0.0 http://maven.apache.org/xsd/decoration-1.0.0.xsd">
- <body>
- <menu ref="parent" />
-
- <menu name="JavaDocs">
- <item name="JavaDocs" href="apidocs/index.html"/>
- <item name="Test JavaDocs" href="testapidocs/index.html"/>
- </menu>
- </body>
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/DECORATION/1.0.0 http://maven.apache.org/xsd/decoration-1.0.0.xsd">
+ <body>
+ <menu ref="parent"/>
+
+ <menu name="JavaDocs">
+ <item name="JavaDocs" href="apidocs/index.html"/>
+ <item name="Test JavaDocs" href="testapidocs/index.html"/>
+ </menu>
+ </body>
</project>
\ No newline at end of file
Modified: incubator/droids/branches/0.2.x-cleanup/droids-tika/src/test/java/org/apache/droids/tika/TikaHtmlParserTest.java
URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-tika/src/test/java/org/apache/droids/tika/TikaHtmlParserTest.java?rev=1423339&r1=1423338&r2=1423339&view=diff
==============================================================================
--- incubator/droids/branches/0.2.x-cleanup/droids-tika/src/test/java/org/apache/droids/tika/TikaHtmlParserTest.java (original)
+++ incubator/droids/branches/0.2.x-cleanup/droids-tika/src/test/java/org/apache/droids/tika/TikaHtmlParserTest.java Tue Dec 18 08:47:39 2012
@@ -19,10 +19,9 @@ package org.apache.droids.tika;
import junit.framework.TestCase;
public class TikaHtmlParserTest extends TestCase {
-
- public void testSomething() throws Exception
- {
- // TODO -- test stuff!
- assertTrue( true );
- }
+
+ public void testSomething() throws Exception {
+ // TODO -- test stuff!
+ assertTrue(true);
+ }
}
\ No newline at end of file