You are viewing a plain text version of this content. The canonical link for it is here.
Posted to droids-commits@incubator.apache.org by to...@apache.org on 2012/12/18 08:48:00 UTC

svn commit: r1423339 [5/5] - in /incubator/droids/branches/0.2.x-cleanup: droids-crawler/ droids-crawler/src/main/java/org/apache/droids/crawler/ droids-crawler/src/main/java/org/apache/droids/protocol/http/ droids-crawler/src/test/java/org/apache/droi...

Modified: incubator/droids/branches/0.2.x-cleanup/droids-spring/src/test/java/org/apache/droids/dynamic/TestSimpleDroid.java
URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-spring/src/test/java/org/apache/droids/dynamic/TestSimpleDroid.java?rev=1423339&r1=1423338&r2=1423339&view=diff
==============================================================================
--- incubator/droids/branches/0.2.x-cleanup/droids-spring/src/test/java/org/apache/droids/dynamic/TestSimpleDroid.java (original)
+++ incubator/droids/branches/0.2.x-cleanup/droids-spring/src/test/java/org/apache/droids/dynamic/TestSimpleDroid.java Tue Dec 18 08:47:39 2012
@@ -36,63 +36,63 @@ import org.springframework.context.suppo
 
 public class TestSimpleDroid {
 
-	protected LocalHttpServer testserver;
+    protected LocalHttpServer testserver;
 
-	private final static ApplicationContext context = new ClassPathXmlApplicationContext(
-			"classpath:/droids-core-test-context.xml");
+    private final static ApplicationContext context = new ClassPathXmlApplicationContext(
+            "classpath:/droids-core-test-context.xml");
 
-	private DroidsConfig droidsConfig = null;
+    private DroidsConfig droidsConfig = null;
 
-	@Before
-	public void setUp() throws Exception {
-		this.droidsConfig = (DroidsConfig) TestSimpleDroid.context
-				.getBean("org.apache.droids.dynamic.DroidsConfig");
-		this.testserver = new LocalHttpServer();
-	}
-
-	@Test
-	public void testReportCrawlingDroid() throws Exception {
-		this.testserver.register("*", new ResourceHandler());
-		this.testserver.start();
-
-		String baseURI = "http:/" + this.testserver.getServiceAddress();
-		String targetURI = baseURI + "/start_html";
-
-		Droid<Link> droid = createSimpleReportCrawlingDroid(targetURI);
-
-		droid.init();
-		droid.start();
-		droid.getTaskMaster().awaitTermination(30, TimeUnit.SECONDS);
-
-		Assert.assertFalse(ReportHandler.getReport().isEmpty());
-		Assert.assertEquals(5, ReportHandler.getReport().size());
-		Assert.assertTrue(ReportHandler.getReport().contains(
-				baseURI + "/start_html"));
-		Assert.assertTrue(ReportHandler.getReport().contains(
-				baseURI + "/page1_html"));
-		Assert.assertTrue(ReportHandler.getReport().contains(
-				baseURI + "/page2_html"));
-		Assert.assertTrue(ReportHandler.getReport().contains(
-				baseURI + "/page3_html"));
-		Assert.assertTrue(ReportHandler.getReport().contains(
-				baseURI + "/page4_html"));
-
-		ReportHandler.recycle();
-	}
-
-	private Droid<Link> createSimpleReportCrawlingDroid(final String targetURI) {
-		Droid<Link> droid = this.droidsConfig.getDroid("report");
-
-		Assert.assertFalse("Droid is null.", droid == null);
-		Assert.assertTrue(
-				"The test droid must be an instance of ReportCrawlingDroid",
-				droid instanceof ReportCrawlingDroid);
-
-		final List<String> locations = new ArrayList<String>();
-		locations.add(targetURI);
-		((CrawlingDroid) droid).setInitialLocations(locations);
+    @Before
+    public void setUp() throws Exception {
+        this.droidsConfig = (DroidsConfig) TestSimpleDroid.context
+                .getBean("org.apache.droids.dynamic.DroidsConfig");
+        this.testserver = new LocalHttpServer();
+    }
+
+    @Test
+    public void testReportCrawlingDroid() throws Exception {
+        this.testserver.register("*", new ResourceHandler());
+        this.testserver.start();
+
+        String baseURI = "http:/" + this.testserver.getServiceAddress();
+        String targetURI = baseURI + "/start_html";
+
+        Droid<Link> droid = createSimpleReportCrawlingDroid(targetURI);
+
+        droid.init();
+        droid.start();
+        droid.getTaskMaster().awaitTermination(30, TimeUnit.SECONDS);
+
+        Assert.assertFalse(ReportHandler.getReport().isEmpty());
+        Assert.assertEquals(5, ReportHandler.getReport().size());
+        Assert.assertTrue(ReportHandler.getReport().contains(
+                baseURI + "/start_html"));
+        Assert.assertTrue(ReportHandler.getReport().contains(
+                baseURI + "/page1_html"));
+        Assert.assertTrue(ReportHandler.getReport().contains(
+                baseURI + "/page2_html"));
+        Assert.assertTrue(ReportHandler.getReport().contains(
+                baseURI + "/page3_html"));
+        Assert.assertTrue(ReportHandler.getReport().contains(
+                baseURI + "/page4_html"));
+
+        ReportHandler.recycle();
+    }
+
+    private Droid<Link> createSimpleReportCrawlingDroid(final String targetURI) {
+        Droid<Link> droid = this.droidsConfig.getDroid("report");
+
+        Assert.assertFalse("Droid is null.", droid == null);
+        Assert.assertTrue(
+                "The test droid must be an instance of ReportCrawlingDroid",
+                droid instanceof ReportCrawlingDroid);
+
+        final List<String> locations = new ArrayList<String>();
+        locations.add(targetURI);
+        ((CrawlingDroid) droid).setInitialLocations(locations);
 
-		return droid;
-	}
+        return droid;
+    }
 
 }

Modified: incubator/droids/branches/0.2.x-cleanup/droids-spring/src/test/resources/droids-core-test-context.xml
URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-spring/src/test/resources/droids-core-test-context.xml?rev=1423339&r1=1423338&r2=1423339&view=diff
==============================================================================
--- incubator/droids/branches/0.2.x-cleanup/droids-spring/src/test/resources/droids-core-test-context.xml (original)
+++ incubator/droids/branches/0.2.x-cleanup/droids-spring/src/test/resources/droids-core-test-context.xml Tue Dec 18 08:47:39 2012
@@ -15,105 +15,105 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 -->
-  <!-- 
-     Using your own context
-    +++++++++++++++++++++++++ 
-    The easiest way is to 
-    a) create a droids-your-context.xml 
-    b) add:
-       <import resource="classpath:/org/apache/droids/dynamic/droids-core-context.xml"/>
-    c) implement your own beans which will override the imported ones
-    d) Call the ant target like:
-       ant droids.crawl default -Ddroids.spring.context=PATH/droids-your-context.xml
- -->
+<!--
+   Using your own context
+  +++++++++++++++++++++++++
+  The easiest way is to
+  a) create a droids-your-context.xml
+  b) add:
+     <import resource="classpath:/org/apache/droids/dynamic/droids-core-context.xml"/>
+  c) implement your own beans which will override the imported ones
+  d) Call the ant target like:
+     ant droids.crawl default -Ddroids.spring.context=PATH/droids-your-context.xml
+-->
 <beans xmlns="http://www.springframework.org/schema/beans"
-  xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
-  xmlns:configurator="http://cocoon.apache.org/schema/configurator"
-  xsi:schemaLocation="http://www.springframework.org/schema/beans http://www.springframework.org/schema/beans/spring-beans-2.5.xsd
+       xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+       xmlns:configurator="http://cocoon.apache.org/schema/configurator"
+       xsi:schemaLocation="http://www.springframework.org/schema/beans http://www.springframework.org/schema/beans/spring-beans-2.5.xsd
        http://cocoon.apache.org/schema/configurator http://cocoon.apache.org/schema/configurator/cocoon-configurator-1.1.0.xsd">
-  
-  <import resource="classpath:/org/apache/droids/dynamic/droids-core-context.xml"/>
-  
-  <!-- configuration properties file -->
-  <bean class="org.springframework.beans.factory.config.PropertyPlaceholderConfigurer">
-    <property name="locations" value="classpath:/droids-core.properties"/>
-  </bean>
-  
-  <bean name="taskExceptionHandler"
-    class="org.apache.droids.impl.DefaultTaskExceptionHandler">
-  </bean>
-  
-  <bean name="taskMaster"
-    class="org.apache.droids.impl.MultiThreadedTaskMaster">
-    <property name="exceptionHandler" ref="taskExceptionHandler" />
-    <property name="delayTimer" ref="org.apache.droids.delay.SimpleDelayTimer"/>
-    <!--<property name="maxThreads" value="${droids.maxThreads}"/>-->
-  </bean>
-  
-  <!-- Droids -->
-  <bean name="org.apache.droids.api.Droid/report"
-    class="org.apache.droids.robot.crawler.ReportCrawlingDroid">
-    <constructor-arg ref="java.util.LinkedList" />
-    <constructor-arg ref="taskMaster" />
-    
-    
-    <property name="protocolFactory" ref="org.apache.droids.helper.factories.ProtocolFactory"/>
-    <property name="parserFactory" ref="org.apache.droids.helper.factories.ParserFactory"/>
-    <property name="filtersFactory" ref="org.apache.droids.helper.factories.FilterFactory"/>
-  </bean>
-  <!-- Queue -->
-  <bean id="java.util.LinkedList"
-    class="java.util.LinkedList">
-  </bean>
-  <!-- Protocol -->
-  <bean 
-    name="org.apache.droids.api.Protocol/http"
-    class="org.apache.droids.protocol.http.HttpProtocol" scope="singleton">
-    <property name="userAgent" value="DROIDS-crawler-x-m01y08"/>
-    <property name="forceAllow" value="${droids.protocol.http.force}"/>
-  </bean>
-  <bean name="org.apache.droids.api.Protocol/file"
-    class="org.apache.droids.protocol.file.FileProtocol" scope="singleton"/>
-  <!-- Parser -->
-  <bean 
-    name="text/html"
-    class="org.apache.droids.parse.html.HtmlParser">
-    <property name="elements">
-    <map>
-        <entry key="a" value="href"/>
-        <entry key="link" value="href"/>
-        <entry key="img" value="src"/>
-        <entry key="script" value="src"/>
-     </map>
-     </property>
-    </bean>
-  <!-- Filter -->
-  <bean
-    name="org.apache.droids.api.URLFilter/org.apache.droids.net.RegexURLFilter"
-    class="org.apache.droids.net.RegexURLFilter">
-    <property name="file" value="${droids.filter.regex}"/>
-  </bean>
-  <!-- Handler -->
-  <bean 
-    name="org.apache.droids.api.Handler/org.apache.droids.handle.SysoutHandler"
-    class="org.apache.droids.handle.SysoutHandler"/>
-  <bean 
-    name="org.apache.droids.api.Handler/org.apache.droids.handle.SaveHandler"
-    class="org.apache.droids.handle.SaveHandler">
-    <property name="saveContentHandlerStrategy" 
-    ref="org.apache.droids.api.Handler/org.apache.droids.handle.DefaultSaveContentHandlerStrategy" />
-  </bean>
-  <bean
-    name="org.apache.droids.api.Handler/org.apache.droids.handle.DefaultSaveContentHandlerStrategy"
-    class="org.apache.droids.handle.DefaultSaveContentHandlerStrategy">
-      <property name="includeHost" value="true" />
-      <property name="outputDir" value="tmp/" />
-  </bean>
-
-  
-  <bean 
-    name="org.apache.droids.delay.SimpleDelayTimer"
-    class="org.apache.droids.delay.SimpleDelayTimer">
-    <property name="delayMillis" value="${droids.delay.request}"/>
-  </bean>
+
+    <import resource="classpath:/org/apache/droids/dynamic/droids-core-context.xml"/>
+
+    <!-- configuration properties file -->
+    <bean class="org.springframework.beans.factory.config.PropertyPlaceholderConfigurer">
+        <property name="locations" value="classpath:/droids-core.properties"/>
+    </bean>
+
+    <bean name="taskExceptionHandler"
+          class="org.apache.droids.impl.DefaultTaskExceptionHandler">
+    </bean>
+
+    <bean name="taskMaster"
+          class="org.apache.droids.impl.MultiThreadedTaskMaster">
+        <property name="exceptionHandler" ref="taskExceptionHandler"/>
+        <property name="delayTimer" ref="org.apache.droids.delay.SimpleDelayTimer"/>
+        <!--<property name="maxThreads" value="${droids.maxThreads}"/>-->
+    </bean>
+
+    <!-- Droids -->
+    <bean name="org.apache.droids.api.Droid/report"
+          class="org.apache.droids.robot.crawler.ReportCrawlingDroid">
+        <constructor-arg ref="java.util.LinkedList"/>
+        <constructor-arg ref="taskMaster"/>
+
+
+        <property name="protocolFactory" ref="org.apache.droids.helper.factories.ProtocolFactory"/>
+        <property name="parserFactory" ref="org.apache.droids.helper.factories.ParserFactory"/>
+        <property name="filtersFactory" ref="org.apache.droids.helper.factories.FilterFactory"/>
+    </bean>
+    <!-- Queue -->
+    <bean id="java.util.LinkedList"
+          class="java.util.LinkedList">
+    </bean>
+    <!-- Protocol -->
+    <bean
+            name="org.apache.droids.api.Protocol/http"
+            class="org.apache.droids.protocol.http.HttpProtocol" scope="singleton">
+        <property name="userAgent" value="DROIDS-crawler-x-m01y08"/>
+        <property name="forceAllow" value="${droids.protocol.http.force}"/>
+    </bean>
+    <bean name="org.apache.droids.api.Protocol/file"
+          class="org.apache.droids.protocol.file.FileProtocol" scope="singleton"/>
+    <!-- Parser -->
+    <bean
+            name="text/html"
+            class="org.apache.droids.parse.html.HtmlParser">
+        <property name="elements">
+            <map>
+                <entry key="a" value="href"/>
+                <entry key="link" value="href"/>
+                <entry key="img" value="src"/>
+                <entry key="script" value="src"/>
+            </map>
+        </property>
+    </bean>
+    <!-- Filter -->
+    <bean
+            name="org.apache.droids.api.URLFilter/org.apache.droids.net.RegexURLFilter"
+            class="org.apache.droids.net.RegexURLFilter">
+        <property name="file" value="${droids.filter.regex}"/>
+    </bean>
+    <!-- Handler -->
+    <bean
+            name="org.apache.droids.api.Handler/org.apache.droids.handle.SysoutHandler"
+            class="org.apache.droids.handle.SysoutHandler"/>
+    <bean
+            name="org.apache.droids.api.Handler/org.apache.droids.handle.SaveHandler"
+            class="org.apache.droids.handle.SaveHandler">
+        <property name="saveContentHandlerStrategy"
+                  ref="org.apache.droids.api.Handler/org.apache.droids.handle.DefaultSaveContentHandlerStrategy"/>
+    </bean>
+    <bean
+            name="org.apache.droids.api.Handler/org.apache.droids.handle.DefaultSaveContentHandlerStrategy"
+            class="org.apache.droids.handle.DefaultSaveContentHandlerStrategy">
+        <property name="includeHost" value="true"/>
+        <property name="outputDir" value="tmp/"/>
+    </bean>
+
+
+    <bean
+            name="org.apache.droids.delay.SimpleDelayTimer"
+            class="org.apache.droids.delay.SimpleDelayTimer">
+        <property name="delayMillis" value="${droids.delay.request}"/>
+    </bean>
 </beans>

Modified: incubator/droids/branches/0.2.x-cleanup/droids-tika/pom.xml
URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-tika/pom.xml?rev=1423339&r1=1423338&r2=1423339&view=diff
==============================================================================
--- incubator/droids/branches/0.2.x-cleanup/droids-tika/pom.xml (original)
+++ incubator/droids/branches/0.2.x-cleanup/droids-tika/pom.xml Tue Dec 18 08:47:39 2012
@@ -15,53 +15,54 @@
    See the License for the specific language governing permissions and
    limitations under the License.
 -->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
 
-  <modelVersion>4.0.0</modelVersion>
-  <parent>
-    <artifactId>droids</artifactId>
-    <groupId>org.apache.droids</groupId>
-    <version>0.3.0-incubating-SNAPSHOT</version>
-    <relativePath>../pom.xml</relativePath>
-  </parent>
-  <artifactId>droids-tika</artifactId>
-  <name>Apache Droids Tika</name>
-  <inceptionYear>2007</inceptionYear>
-  <description>Apache Droids Tika Parser</description>
-  <packaging>jar</packaging>  
+    <modelVersion>4.0.0</modelVersion>
+    <parent>
+        <artifactId>droids</artifactId>
+        <groupId>org.apache.droids</groupId>
+        <version>0.3.0-incubating-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>droids-tika</artifactId>
+    <name>Apache Droids Tika</name>
+    <inceptionYear>2007</inceptionYear>
+    <description>Apache Droids Tika Parser</description>
+    <packaging>jar</packaging>
 
-  <properties>
-    <tika-release-version>1.1</tika-release-version>
-  </properties>
+    <properties>
+        <tika-release-version>1.1</tika-release-version>
+    </properties>
 
-  <dependencies>
-    <dependency>
-      <groupId>org.apache.droids</groupId>
-      <artifactId>droids-core</artifactId>
-      <version>${project.version}</version>
-    </dependency>
-    <dependency>
-    	<groupId>org.apache.tika</groupId>
-    	<artifactId>tika-core</artifactId>
-    	<version>${tika-release-version}</version>
-    </dependency>
-    <dependency>
-    	<groupId>org.apache.tika</groupId>
-    	<artifactId>tika-parsers</artifactId>
-    	<version>${tika-release-version}</version>
-    	<exclusions>
-    		<exclusion>
-    			<artifactId>commons-logging</artifactId>
-    			<groupId>commons-logging</groupId>
-    		</exclusion>
-    	</exclusions>
-    </dependency>
-    <!-- test dependencies -->
-    <dependency>
-      <groupId>junit</groupId>
-      <artifactId>junit</artifactId>
-      <version>${junit.version}</version>
-      <scope>test</scope>
-    </dependency>
-  </dependencies>
+    <dependencies>
+        <dependency>
+            <groupId>org.apache.droids</groupId>
+            <artifactId>droids-core</artifactId>
+            <version>${project.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.tika</groupId>
+            <artifactId>tika-core</artifactId>
+            <version>${tika-release-version}</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.tika</groupId>
+            <artifactId>tika-parsers</artifactId>
+            <version>${tika-release-version}</version>
+            <exclusions>
+                <exclusion>
+                    <artifactId>commons-logging</artifactId>
+                    <groupId>commons-logging</groupId>
+                </exclusion>
+            </exclusions>
+        </dependency>
+        <!-- test dependencies -->
+        <dependency>
+            <groupId>junit</groupId>
+            <artifactId>junit</artifactId>
+            <version>${junit.version}</version>
+            <scope>test</scope>
+        </dependency>
+    </dependencies>
 </project>

Modified: incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/TikaDocumentParser.java
URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/TikaDocumentParser.java?rev=1423339&r1=1423338&r2=1423339&view=diff
==============================================================================
--- incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/TikaDocumentParser.java (original)
+++ incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/TikaDocumentParser.java Tue Dec 18 08:47:39 2012
@@ -54,76 +54,75 @@ import org.xml.sax.SAXException;
 /**
  * Parses documents using Tika.
  * Any document type that Tika can handle, can be handled by this class,
- * including HTML. 
- *
+ * including HTML.
  */
 public class TikaDocumentParser implements TikaParser {
 
-  protected static final Logger LOG = LoggerFactory.getLogger(TikaDocumentParser.class);
-  
-  @Override
-  public TikaParse parse(ContentEntity entity, Task task) throws DroidsException,
-      IOException {
-    // Init Tika objects
-    org.apache.tika.parser.Parser parser = new AutoDetectParser();
-    Metadata metadata = new Metadata();
-    
-    String charset = entity.getCharset();
-    if (charset == null) {
-      charset = "UTF-8";
-    }
-    
-    StringWriter dataBuffer = new StringWriter();
-    StringWriter bodyBuffer = new StringWriter();
-    StringWriter mainContentBuffer = new StringWriter();
-     
-    SAXTransformerFactory factory = (SAXTransformerFactory) SAXTransformerFactory.newInstance();
-    TransformerHandler xmlHandler;
-    try {
-      xmlHandler = factory.newTransformerHandler();
-    } catch (TransformerConfigurationException e) {
-      throw new DroidsException(e);
-    }
-    xmlHandler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
-    xmlHandler.setResult(new StreamResult(dataBuffer));
-    
-    BoilerpipeContentHandler mainContentHandler = new BoilerpipeContentHandler(mainContentBuffer);
-    BodyContentHandler bodyHandler = new BodyContentHandler(bodyBuffer);
-    LinkContentHandler linkHandler = new LinkContentHandler();
-    
-    TeeContentHandler parallelHandler = new TeeContentHandler(xmlHandler, mainContentHandler, bodyHandler, linkHandler );
-
-    InputStream instream = entity.obtainContent();
-    try {
-      parser.parse(instream, parallelHandler, metadata, new ParseContext());
-      
-      ArrayList<Link> extractedTasks = new ArrayList<Link>();
-      int depth = task.getDepth() + 1;
-      if (task instanceof LinkTask) {
-	      for(org.apache.tika.sax.Link tikaLink : linkHandler.getLinks()) {
-	        try {
-	          URI uri = new URI(tikaLink.getUri());
-            // Test to see if the scheme is empty
-            // This would indicate a relative URL, so resolve it against the task URI
-            if(uri.getScheme() == null) {
-              uri = ((Link) task).getURI().resolve(uri);
+    protected static final Logger LOG = LoggerFactory.getLogger(TikaDocumentParser.class);
+
+    @Override
+    public TikaParse parse(ContentEntity entity, Task task) throws DroidsException,
+            IOException {
+        // Init Tika objects
+        org.apache.tika.parser.Parser parser = new AutoDetectParser();
+        Metadata metadata = new Metadata();
+
+        String charset = entity.getCharset();
+        if (charset == null) {
+            charset = "UTF-8";
+        }
+
+        StringWriter dataBuffer = new StringWriter();
+        StringWriter bodyBuffer = new StringWriter();
+        StringWriter mainContentBuffer = new StringWriter();
+
+        SAXTransformerFactory factory = (SAXTransformerFactory) SAXTransformerFactory.newInstance();
+        TransformerHandler xmlHandler;
+        try {
+            xmlHandler = factory.newTransformerHandler();
+        } catch (TransformerConfigurationException e) {
+            throw new DroidsException(e);
+        }
+        xmlHandler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
+        xmlHandler.setResult(new StreamResult(dataBuffer));
+
+        BoilerpipeContentHandler mainContentHandler = new BoilerpipeContentHandler(mainContentBuffer);
+        BodyContentHandler bodyHandler = new BodyContentHandler(bodyBuffer);
+        LinkContentHandler linkHandler = new LinkContentHandler();
+
+        TeeContentHandler parallelHandler = new TeeContentHandler(xmlHandler, mainContentHandler, bodyHandler, linkHandler);
+
+        InputStream instream = entity.obtainContent();
+        try {
+            parser.parse(instream, parallelHandler, metadata, new ParseContext());
+
+            ArrayList<Link> extractedTasks = new ArrayList<Link>();
+            int depth = task.getDepth() + 1;
+            if (task instanceof LinkTask) {
+                for (org.apache.tika.sax.Link tikaLink : linkHandler.getLinks()) {
+                    try {
+                        URI uri = new URI(tikaLink.getUri());
+                        // Test to see if the scheme is empty
+                        // This would indicate a relative URL, so resolve it against the task URI
+                        if (uri.getScheme() == null) {
+                            uri = ((Link) task).getURI().resolve(uri);
+                        }
+                        extractedTasks.add(new LinkTask((Link) task, uri, depth, tikaLink.getText()));
+                    } catch (URISyntaxException e) {
+                        if (LOG.isWarnEnabled()) {
+                            LOG.warn("URI not valid: " + tikaLink.getUri());
+                        }
+                    }
+                }
             }
-            extractedTasks.add(new LinkTask((Link)task, uri, depth, tikaLink.getText()));
-	        } catch (URISyntaxException e) {
-	          if(LOG.isWarnEnabled()) {
-	            LOG.warn("URI not valid: "+ tikaLink.getUri());
-	          }
-	        }
-	      }
-      }
-      return new TikaParseImpl(dataBuffer.toString(), extractedTasks, bodyBuffer.toString(), mainContentBuffer.toString(), metadata);
-    } catch (SAXException ex) {
-      throw new DroidsException("Failure parsing document " + task.getId(), ex);
-    } catch (TikaException ex) {
-      throw new DroidsException("Failure parsing document " + task.getId(), ex);
-    } finally {
-      instream.close();
-    } 
-  }
+            return new TikaParseImpl(dataBuffer.toString(), extractedTasks, bodyBuffer.toString(), mainContentBuffer.toString(), metadata);
+        } catch (SAXException ex) {
+            throw new DroidsException("Failure parsing document " + task.getId(), ex);
+        } catch (TikaException ex) {
+            throw new DroidsException("Failure parsing document " + task.getId(), ex);
+        } finally {
+            instream.close();
+        }
+    }
 
 }

Modified: incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java
URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java?rev=1423339&r1=1423338&r2=1423339&view=diff
==============================================================================
--- incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java (original)
+++ incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java Tue Dec 18 08:47:39 2012
@@ -50,76 +50,74 @@ import org.apache.tika.sax.TeeContentHan
 import org.xml.sax.SAXException;
 
 /**
- * 
  * @deprecated Use TikaDocumentParser instead as it handles HTML just fine and performs the same operations.
- *
  */
 @Deprecated
 public class TikaHtmlParser implements TikaParser {
 
-  protected static final Log log = LogFactory.getLog(TikaHtmlParser.class);
+    protected static final Log log = LogFactory.getLog(TikaHtmlParser.class);
 
-  @Override
-  public TikaParse parse(ContentEntity entity, Task task) throws IOException, DroidsException {
-    // Init Tika objects
-    org.apache.tika.parser.Parser parser = new AutoDetectParser();
-    Metadata metadata = new Metadata();
-    
-    String charset = entity.getCharset();
-    if (charset == null) {
-      charset = "UTF-8";
-    }
-    
-    StringWriter dataBuffer = new StringWriter();
-    StringWriter bodyBuffer = new StringWriter();
-    StringWriter mainContentBuffer = new StringWriter();
-     
-    SAXTransformerFactory factory = (SAXTransformerFactory) SAXTransformerFactory.newInstance();
-    TransformerHandler xmlHandler;
-    try {
-      xmlHandler = factory.newTransformerHandler();
-    } catch (TransformerConfigurationException e) {
-      throw new DroidsException(e);
+    @Override
+    public TikaParse parse(ContentEntity entity, Task task) throws IOException, DroidsException {
+        // Init Tika objects
+        org.apache.tika.parser.Parser parser = new AutoDetectParser();
+        Metadata metadata = new Metadata();
+
+        String charset = entity.getCharset();
+        if (charset == null) {
+            charset = "UTF-8";
+        }
+
+        StringWriter dataBuffer = new StringWriter();
+        StringWriter bodyBuffer = new StringWriter();
+        StringWriter mainContentBuffer = new StringWriter();
+
+        SAXTransformerFactory factory = (SAXTransformerFactory) SAXTransformerFactory.newInstance();
+        TransformerHandler xmlHandler;
+        try {
+            xmlHandler = factory.newTransformerHandler();
+        } catch (TransformerConfigurationException e) {
+            throw new DroidsException(e);
+        }
+        xmlHandler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
+        xmlHandler.setResult(new StreamResult(dataBuffer));
+
+        BoilerpipeContentHandler mainContentHandler = new BoilerpipeContentHandler(mainContentBuffer);
+        BodyContentHandler bodyHandler = new BodyContentHandler(bodyBuffer);
+        LinkContentHandler linkHandler = new LinkContentHandler();
+
+        TeeContentHandler parallelHandler = new TeeContentHandler(xmlHandler, mainContentHandler, bodyHandler, linkHandler);
+
+        InputStream instream = entity.obtainContent();
+        try {
+            parser.parse(instream, parallelHandler, metadata, new ParseContext());
+
+            ArrayList<Link> extractedTasks = new ArrayList<Link>();
+            if (task instanceof Link) {
+                int depth = task.getDepth() + 1;
+                for (org.apache.tika.sax.Link tikaLink : linkHandler.getLinks()) {
+                    try {
+                        URI uri = new URI(tikaLink.getUri());
+                        // Test to see if the scheme is empty
+                        // This would indicate a relative URL, so resolve it against the task URI
+                        if (uri.getScheme() == null) {
+                            uri = ((Link) task).getURI().resolve(uri);
+                        }
+                        extractedTasks.add(new LinkTask((Link) task, uri, depth, tikaLink.getText()));
+                    } catch (URISyntaxException e) {
+                        if (log.isWarnEnabled()) {
+                            log.warn("URI not valid: " + tikaLink.getUri());
+                        }
+                    }
+                }
+            }
+            return new TikaParseImpl(dataBuffer.toString(), extractedTasks, bodyBuffer.toString(), mainContentBuffer.toString(), metadata);
+        } catch (SAXException ex) {
+            throw new DroidsException("Failure parsing document " + task.getId(), ex);
+        } catch (TikaException ex) {
+            throw new DroidsException("Failure parsing document " + task.getId(), ex);
+        } finally {
+            instream.close();
+        }
     }
-    xmlHandler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
-    xmlHandler.setResult(new StreamResult(dataBuffer));
-    
-    BoilerpipeContentHandler mainContentHandler = new BoilerpipeContentHandler(mainContentBuffer);
-    BodyContentHandler bodyHandler = new BodyContentHandler(bodyBuffer);
-    LinkContentHandler linkHandler = new LinkContentHandler();
-    
-    TeeContentHandler parallelHandler = new TeeContentHandler(xmlHandler, mainContentHandler, bodyHandler, linkHandler );
-
-    InputStream instream = entity.obtainContent();
-    try {
-      parser.parse(instream, parallelHandler, metadata, new ParseContext());
-      
-      ArrayList<Link> extractedTasks = new ArrayList<Link>();
-      if (task instanceof Link) {
-	      int depth = task.getDepth() + 1;
-	      for(org.apache.tika.sax.Link tikaLink : linkHandler.getLinks()) {
-	        try {
-	          URI uri = new URI(tikaLink.getUri());
-	          // Test to see if the scheme is empty
-	          // This would indicate a relative URL, so resolve it against the task URI
-	          if(uri.getScheme() == null) {
-	            uri = ((Link) task).getURI().resolve(uri);
-	          }
-	          extractedTasks.add(new LinkTask((Link)task, uri, depth, tikaLink.getText()));
-	        } catch (URISyntaxException e) {
-	          if(log.isWarnEnabled()) {
-	            log.warn("URI not valid: "+ tikaLink.getUri());
-	          }
-	        }
-	      }
-      }
-      return new TikaParseImpl(dataBuffer.toString(), extractedTasks, bodyBuffer.toString(), mainContentBuffer.toString(), metadata);
-    } catch (SAXException ex) {
-      throw new DroidsException("Failure parsing document " + task.getId(), ex);
-    } catch (TikaException ex) {
-      throw new DroidsException("Failure parsing document " + task.getId(), ex);
-    } finally {
-      instream.close();
-    } 
-  }
 }

Modified: incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/api/TikaParse.java
URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/api/TikaParse.java?rev=1423339&r1=1423338&r2=1423339&view=diff
==============================================================================
--- incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/api/TikaParse.java (original)
+++ incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/api/TikaParse.java Tue Dec 18 08:47:39 2012
@@ -21,36 +21,41 @@ import org.apache.tika.metadata.Metadata
 
 public interface TikaParse extends Parse {
 
-	/**
-	 * Retrieves the main content of the parsed document.
-	 * Uses Tika's plugin in for Boilerpipe.
-	 * @return plain text result with boilerplate removed
-	 */
-  public String getMainContent();
-  
-  /**
-   * Extracted meta data from the document. This can include
-   * meta tags from within an HTML document
-   * @return metadata object from the parse
-   */
-  public Metadata getMetadata();
-  
-  /**
-   * The HTML representation of the document.
-   * @return The HTML representation of the document.
-   */
-  public String getXml();
-  
-  /**
-   * Plain text representation of the document.
-   * @return plain text version without formatting
-   */
-  public String getPlainText();
-  
-  /**
-   * If the document should be indexed or not.
-   * This can be determined from metadata or other methods
-   * @return false if the document shouldn't be indexed, true otherwise
-   */
-  public boolean isIndexed();
+    /**
+     * Retrieves the main content of the parsed document.
+     * Uses Tika's plugin for Boilerpipe.
+     *
+     * @return plain text result with boilerplate removed
+     */
+    public String getMainContent();
+
+    /**
+     * Metadata extracted from the document. This can include
+     * meta tags from within an HTML document.
+     *
+     * @return metadata object from the parse
+     */
+    public Metadata getMetadata();
+
+    /**
+     * The HTML representation of the document.
+     *
+     * @return The HTML representation of the document.
+     */
+    public String getXml();
+
+    /**
+     * Plain text representation of the document.
+     *
+     * @return plain text version without formatting
+     */
+    public String getPlainText();
+
+    /**
+     * Whether the document should be indexed.
+     * This can be determined from metadata or other methods.
+     *
+     * @return false if the document shouldn't be indexed, true otherwise
+     */
+    public boolean isIndexed();
 }

Modified: incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/parse/TikaParseImpl.java
URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/parse/TikaParseImpl.java?rev=1423339&r1=1423338&r2=1423339&view=diff
==============================================================================
--- incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/parse/TikaParseImpl.java (original)
+++ incubator/droids/branches/0.2.x-cleanup/droids-tika/src/main/java/org/apache/droids/tika/parse/TikaParseImpl.java Tue Dec 18 08:47:39 2012
@@ -26,60 +26,60 @@ import org.apache.tika.metadata.Metadata
 
 public class TikaParseImpl extends ParseImpl implements TikaParse {
 
-  private String plainText;
-  private String mainContent;
-  private Metadata metadata;
-  
-  public TikaParseImpl(String text, Collection<Link> outlinks) {
-    super(text,outlinks);
-  }
-  
-  public TikaParseImpl(String text, Object data, Collection<Link> outlinks) {
-    super(text,data,outlinks);
-  }
-
-  public TikaParseImpl(String xmlContent, ArrayList<Link> extractedTasks,
-      String plainText, String mainContent, Metadata metadata) {
-    this(xmlContent, extractedTasks);
-    this.plainText = plainText;
-    this.mainContent = mainContent;
-    this.metadata = metadata;
-  }
-
-  @Override
-  public String getMainContent() {
-    return mainContent;
-  }
-
-  @Override
-  public Metadata getMetadata() {
-    return metadata;
-  }
-
-  @Override
-  public String getXml() {
-    return super.text;
-  }
-
-  @Override
-  public String getPlainText() {
-    return plainText;
-  }
-
-  @Override
-  public boolean isFollowed() {
-    if(metadata.get("robots") != null && metadata.get("robots").toLowerCase().contains("nofollow")) {
-      return false;
-    }
-    return true;
-  }
-
-  @Override
-  public boolean isIndexed() {
-    if(metadata.get("robots") != null && metadata.get("robots").toLowerCase().contains("noindex")) {
-      return false;
+    private String plainText;
+    private String mainContent;
+    private Metadata metadata;
+
+    public TikaParseImpl(String text, Collection<Link> outlinks) {
+        super(text, outlinks);
+    }
+
+    public TikaParseImpl(String text, Object data, Collection<Link> outlinks) {
+        super(text, data, outlinks);
+    }
+
+    public TikaParseImpl(String xmlContent, ArrayList<Link> extractedTasks,
+                         String plainText, String mainContent, Metadata metadata) {
+        this(xmlContent, extractedTasks);
+        this.plainText = plainText;
+        this.mainContent = mainContent;
+        this.metadata = metadata;
+    }
+
+    @Override
+    public String getMainContent() {
+        return mainContent;
+    }
+
+    @Override
+    public Metadata getMetadata() {
+        return metadata;
+    }
+
+    @Override
+    public String getXml() {
+        return super.text;
+    }
+
+    @Override
+    public String getPlainText() {
+        return plainText;
+    }
+
+    @Override
+    public boolean isFollowed() {
+        if (metadata.get("robots") != null && metadata.get("robots").toLowerCase().contains("nofollow")) {
+            return false;
+        }
+        return true;
+    }
+
+    @Override
+    public boolean isIndexed() {
+        if (metadata.get("robots") != null && metadata.get("robots").toLowerCase().contains("noindex")) {
+            return false;
+        }
+        return true;
     }
-    return true;
-  }
 
 }

Modified: incubator/droids/branches/0.2.x-cleanup/droids-tika/src/site/site.xml
URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-tika/src/site/site.xml?rev=1423339&r1=1423338&r2=1423339&view=diff
==============================================================================
--- incubator/droids/branches/0.2.x-cleanup/droids-tika/src/site/site.xml (original)
+++ incubator/droids/branches/0.2.x-cleanup/droids-tika/src/site/site.xml Tue Dec 18 08:47:39 2012
@@ -16,14 +16,14 @@
  limitations under the License.
 -->
 <project xmlns="http://maven.apache.org/DECORATION/1.0.0"
-    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
-    xsi:schemaLocation="http://maven.apache.org/DECORATION/1.0.0 http://maven.apache.org/xsd/decoration-1.0.0.xsd">
-  <body>
-    <menu ref="parent" />
-    
-    <menu name="JavaDocs"> 
-      <item name="JavaDocs" href="apidocs/index.html"/>
-      <item name="Test JavaDocs" href="testapidocs/index.html"/>
-    </menu>
-  </body>
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/DECORATION/1.0.0 http://maven.apache.org/xsd/decoration-1.0.0.xsd">
+    <body>
+        <menu ref="parent"/>
+
+        <menu name="JavaDocs">
+            <item name="JavaDocs" href="apidocs/index.html"/>
+            <item name="Test JavaDocs" href="testapidocs/index.html"/>
+        </menu>
+    </body>
 </project>
\ No newline at end of file

Modified: incubator/droids/branches/0.2.x-cleanup/droids-tika/src/test/java/org/apache/droids/tika/TikaHtmlParserTest.java
URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-tika/src/test/java/org/apache/droids/tika/TikaHtmlParserTest.java?rev=1423339&r1=1423338&r2=1423339&view=diff
==============================================================================
--- incubator/droids/branches/0.2.x-cleanup/droids-tika/src/test/java/org/apache/droids/tika/TikaHtmlParserTest.java (original)
+++ incubator/droids/branches/0.2.x-cleanup/droids-tika/src/test/java/org/apache/droids/tika/TikaHtmlParserTest.java Tue Dec 18 08:47:39 2012
@@ -19,10 +19,9 @@ package org.apache.droids.tika;
 import junit.framework.TestCase;
 
 public class TikaHtmlParserTest extends TestCase {
- 
-  public void testSomething() throws Exception
-  {
-    // TODO -- test stuff!
-    assertTrue( true );
-  }
+
+    public void testSomething() throws Exception {
+        // TODO -- test stuff!
+        assertTrue(true);
+    }
 }
\ No newline at end of file