You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by jn...@apache.org on 2010/06/30 12:36:29 UTC
svn commit: r959259 [9/12] - in /nutch/branches/nutchbase: ./ bin/ conf/ contrib/ docs/ ivy/ lib/ lib/jetty-ext/ src/engines/ src/gora/ src/java/ src/java/org/apache/nutch/analysis/ src/java/org/apache/nutch/clustering/ src/java/org/apache/nutch/crawl/...

Added: nutch/branches/nutchbase/src/plugin/nutch-extensionpoints/ivy.xml
URL: http://svn.apache.org/viewvc/nutch/branches/nutchbase/src/plugin/nutch-extensionpoints/ivy.xml?rev=959259&view=auto
==============================================================================
--- nutch/branches/nutchbase/src/plugin/nutch-extensionpoints/ivy.xml (added)
+++ nutch/branches/nutchbase/src/plugin/nutch-extensionpoints/ivy.xml Wed Jun 30 10:36:20 2010
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../../ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!-- no name given: the artifact name defaults to the module name -->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

Modified: nutch/branches/nutchbase/src/plugin/nutch-extensionpoints/plugin.xml
URL: http://svn.apache.org/viewvc/nutch/branches/nutchbase/src/plugin/nutch-extensionpoints/plugin.xml?rev=959259&r1=959258&r2=959259&view=diff
==============================================================================
--- nutch/branches/nutchbase/src/plugin/nutch-extensionpoints/plugin.xml (original)
+++ nutch/branches/nutchbase/src/plugin/nutch-extensionpoints/plugin.xml Wed Jun 30 10:36:20 2010
@@ -18,29 +18,17 @@
 <plugin
    id="nutch-extensionpoints"
    name="the nutch core extension points"
-   version="0.8.0"
+   version="2.0.0"
    provider-name="nutch.org">
 
    <!-- this file hosts all extension points nutch core code offers. 
-   Please not that plugins can define extension points as well to be extendable.-->
-
-<extension-point
-      id="org.apache.nutch.clustering.OnlineClusterer"
-      name="Nutch Online Search Results Clustering Plugin"/>
-
-<extension-point
-      id="org.apache.nutch.indexer.field.FieldFilter"
-      name="Nutch Field Filter"/>
-      
+   Please note that plugins can also define their own extension points so that they can be extended.-->
+     
 <extension-point
       id="org.apache.nutch.indexer.IndexingFilter"
       name="Nutch Indexing Filter"/>
 
 <extension-point
-      id="org.apache.nutch.ontology.Ontology"
-      name="Ontology Model Loader"/>
-
-<extension-point
       id="org.apache.nutch.parse.Parser"
       name="Nutch Content Parser"/>
  
@@ -53,10 +41,6 @@
       name="Nutch Protocol"/>
 
 <extension-point
-      id="org.apache.nutch.searcher.QueryFilter"
-      name="Nutch Query Filter"/>
-
-<extension-point
       id="org.apache.nutch.net.URLFilter"
       name="Nutch URL Filter"/>
 
@@ -65,18 +49,6 @@
       name="Nutch URL Normalizer"/>
 
 <extension-point
-      id="org.apache.nutch.analysis.NutchAnalyzer"
-      name="Nutch Analysis"/>
-
-<extension-point
-      id="org.apache.nutch.searcher.response.ResponseWriter"
-      name="Nutch Search Results Response Writer"/>
-      
-<extension-point
-      id="org.apache.nutch.searcher.Summarizer"
-      name="Nutch Summarizer"/>
-
-<extension-point
       id="org.apache.nutch.scoring.ScoringFilter"
       name="Nutch Scoring"/>
 

Added: nutch/branches/nutchbase/src/plugin/parse-ext/ivy.xml
URL: http://svn.apache.org/viewvc/nutch/branches/nutchbase/src/plugin/parse-ext/ivy.xml?rev=959259&view=auto
==============================================================================
--- nutch/branches/nutchbase/src/plugin/parse-ext/ivy.xml (added)
+++ nutch/branches/nutchbase/src/plugin/parse-ext/ivy.xml Wed Jun 30 10:36:20 2010
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../../ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!-- no name given: the artifact name defaults to the module name -->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

Added: nutch/branches/nutchbase/src/plugin/parse-html/ivy.xml
URL: http://svn.apache.org/viewvc/nutch/branches/nutchbase/src/plugin/parse-html/ivy.xml?rev=959259&view=auto
==============================================================================
--- nutch/branches/nutchbase/src/plugin/parse-html/ivy.xml (added)
+++ nutch/branches/nutchbase/src/plugin/parse-html/ivy.xml Wed Jun 30 10:36:20 2010
@@ -0,0 +1,42 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../../ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!-- no name given: the artifact name defaults to the module name -->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+    <dependency org="org.ccil.cowan.tagsoup" name="tagsoup" rev="1.2" conf="*->master"/>
+  </dependencies>
+  
+</ivy-module>

Modified: nutch/branches/nutchbase/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchbase/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java?rev=959259&r1=959258&r2=959259&view=diff
==============================================================================
--- nutch/branches/nutchbase/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java (original)
+++ nutch/branches/nutchbase/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java Wed Jun 30 10:36:20 2010
@@ -25,6 +25,7 @@ import java.io.IOException;
 import java.io.UnsupportedEncodingException;
 import java.net.MalformedURLException;
 import java.net.URL;
+import java.nio.ByteBuffer;
 import java.nio.charset.Charset;
 import java.util.ArrayList;
 import java.util.Arrays;
@@ -33,10 +34,10 @@ import java.util.HashSet;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
+import org.apache.avro.util.Utf8;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.hbase.client.Result;
 import org.apache.hadoop.hbase.util.Bytes;
 import org.apache.html.dom.HTMLDocumentImpl;
 import org.apache.nutch.metadata.Metadata;
@@ -45,15 +46,15 @@ import org.apache.nutch.parse.HTMLMetaTa
 import org.apache.nutch.parse.HtmlParseFilters;
 import org.apache.nutch.parse.Outlink;
 import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.parse.ParseStatusCodes;
+import org.apache.nutch.parse.ParseStatusUtils;
 import org.apache.nutch.parse.Parser;
+import org.apache.nutch.storage.ParseStatus;
+import org.apache.nutch.storage.WebPage;
 import org.apache.nutch.util.EncodingDetector;
 import org.apache.nutch.util.LogUtil;
 import org.apache.nutch.util.NutchConfiguration;
-import org.apache.nutch.util.hbase.HbaseColumn;
-import org.apache.nutch.util.hbase.TableUtil;
-import org.apache.nutch.util.hbase.WebTableRow;
-import org.apache.nutch.util.hbase.WebTableColumns;
+import org.apache.nutch.util.TableUtil;
 import org.cyberneko.html.parsers.DOMFragmentParser;
 import org.w3c.dom.DOMException;
 import org.w3c.dom.DocumentFragment;
@@ -63,33 +64,33 @@ import org.xml.sax.SAXException;
 public class HtmlParser implements Parser {
   public static final Log LOG = LogFactory.getLog("org.apache.nutch.parse.html");
 
-  // I used 1000 bytes at first, but  found that some documents have 
-  // meta tag well past the first 1000 bytes. 
+  // I used 1000 bytes at first, but found that some documents have
+  // a meta tag well past the first 1000 bytes.
   // (e.g. http://cn.promo.yahoo.com/customcare/music.html)
   private static final int CHUNK_SIZE = 2000;
   private static Pattern metaPattern =
     Pattern.compile("<meta\\s+([^>]*http-equiv=\"?content-type\"?[^>]*)>",
-                    Pattern.CASE_INSENSITIVE);
+        Pattern.CASE_INSENSITIVE);
   private static Pattern charsetPattern =
     Pattern.compile("charset=\\s*([a-z][_\\-0-9a-z]*)",
-                    Pattern.CASE_INSENSITIVE);
-  
-  private static Collection<HbaseColumn> COLUMNS = new HashSet<HbaseColumn>();
-  
+        Pattern.CASE_INSENSITIVE);
+
+  private static Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();
+
   static {
-    COLUMNS.add(new HbaseColumn(WebTableColumns.BASE_URL));
+    FIELDS.add(WebPage.Field.BASE_URL);
   }
-  
+
   private String parserImpl;
 
   /**
-   * Given a <code>byte[]</code> representing an html file of an 
-   * <em>unknown</em> encoding,  read out 'charset' parameter in the meta tag   
+   * Given a <code>byte[]</code> representing an html file of an
+   * <em>unknown</em> encoding, read out the 'charset' parameter in the meta tag
    * from the first <code>CHUNK_SIZE</code> bytes.
    * If there's no meta tag for Content-Type or no charset is specified,
    * <code>null</code> is returned.  <br />
    * FIXME: non-byte oriented character encodings (UTF-16, UTF-32)
-   * can't be handled with this. 
+   * can't be handled with this.
    * We need to do something similar to what's done by mozilla
    * (http://lxr.mozilla.org/seamonkey/source/parser/htmlparser/src/nsParser.cpp#1993).
    * See also http://www.w3.org/TR/REC-xml/#sec-guessing
@@ -99,19 +100,19 @@ public class HtmlParser implements Parse
    */
 
   private static String sniffCharacterEncoding(byte[] content) {
-    int length = content.length < CHUNK_SIZE ? 
-                 content.length : CHUNK_SIZE;
+    int length = content.length < CHUNK_SIZE ?
+        content.length : CHUNK_SIZE;
 
     // We don't care about non-ASCII parts so that it's sufficient
-    // to just inflate each byte to a 16-bit value by padding. 
-    // For instance, the sequence {0x41, 0x82, 0xb7} will be turned into 
-    // {U+0041, U+0082, U+00B7}. 
+    // to just inflate each byte to a 16-bit value by padding.
+    // For instance, the sequence {0x41, 0x82, 0xb7} will be turned into
+    // {U+0041, U+0082, U+00B7}.
     String str = "";
     try {
       str = new String(content, 0, length,
-                       Charset.forName("ASCII").toString());
+          Charset.forName("ASCII").toString());
     } catch (UnsupportedEncodingException e) {
-      // code should never come here, but just in case... 
+      // code should never come here, but just in case...
       return null;
     }
 
@@ -119,7 +120,7 @@ public class HtmlParser implements Parse
     String encoding = null;
     if (metaMatcher.find()) {
       Matcher charsetMatcher = charsetPattern.matcher(metaMatcher.group(1));
-      if (charsetMatcher.find()) 
+      if (charsetMatcher.find())
         encoding = new String(charsetMatcher.group(1));
     }
 
@@ -129,22 +130,22 @@ public class HtmlParser implements Parse
   private String defaultCharEncoding;
 
   private Configuration conf;
-  
+
   private DOMContentUtils utils;
 
   private HtmlParseFilters htmlParseFilters;
-  
+
   private String cachingPolicy;
 
-  public Parse getParse(String url, WebTableRow row) {
+  public Parse getParse(String url, WebPage page) {
     HTMLMetaTags metaTags = new HTMLMetaTags();
 
-    String baseUrl = row.getBaseUrl();
+    String baseUrl = TableUtil.toString(page.getBaseUrl());
     URL base;
     try {
       base = new URL(baseUrl);
     } catch (MalformedURLException e) {
-      return new ParseStatus(e).getEmptyParseHbase(getConf());
+      return ParseStatusUtils.getEmptyParse(e, getConf());
     }
 
     String text = "";
@@ -155,13 +156,13 @@ public class HtmlParser implements Parse
     // parse the content
     DocumentFragment root;
     try {
-      byte[] contentInOctets = row.getContent();
+      byte[] contentInOctets = page.getContent().array();
       InputSource input = new InputSource(new ByteArrayInputStream(contentInOctets));
 
       EncodingDetector detector = new EncodingDetector(conf);
-      detector.autoDetectClues(row, true);
+      detector.autoDetectClues(page, true);
       detector.addClue(sniffCharacterEncoding(contentInOctets), "sniffed");
-      String encoding = detector.guessEncoding(row, defaultCharEncoding);
+      String encoding = detector.guessEncoding(page, defaultCharEncoding);
 
       metadata.set(Metadata.ORIGINAL_CHAR_ENCODING, encoding);
       metadata.set(Metadata.CHAR_ENCODING_FOR_CONVERSION, encoding);
@@ -170,16 +171,16 @@ public class HtmlParser implements Parse
       if (LOG.isTraceEnabled()) { LOG.trace("Parsing..."); }
       root = parse(input);
     } catch (IOException e) {
-      return new ParseStatus(e).getEmptyParseHbase(getConf());
+      return ParseStatusUtils.getEmptyParse(e, getConf());
     } catch (DOMException e) {
-      return new ParseStatus(e).getEmptyParseHbase(getConf());
+      return ParseStatusUtils.getEmptyParse(e, getConf());
     } catch (SAXException e) {
-      return new ParseStatus(e).getEmptyParseHbase(getConf());
+      return ParseStatusUtils.getEmptyParse(e, getConf());
     } catch (Exception e) {
       e.printStackTrace(LogUtil.getWarnStream(LOG));
-      return new ParseStatus(e).getEmptyParseHbase(getConf());
+      return ParseStatusUtils.getEmptyParse(e, getConf());
     }
-      
+
     // get meta directives
     HTMLMetaProcessor.getMetaTags(metaTags, root, base);
     if (LOG.isTraceEnabled()) {
@@ -196,7 +197,7 @@ public class HtmlParser implements Parse
       utils.getTitle(sb, root);         // extract title
       title = sb.toString().trim();
     }
-      
+
     if (!metaTags.getNoFollow()) {              // okay to follow links
       ArrayList<Outlink> l = new ArrayList<Outlink>();   // extract outlinks
       URL baseTag = utils.getBase(root);
@@ -207,21 +208,23 @@ public class HtmlParser implements Parse
         LOG.trace("found "+outlinks.length+" outlinks in "+ url);
       }
     }
-    
-    ParseStatus status = new ParseStatus(ParseStatus.SUCCESS);
+
+    ParseStatus status = new ParseStatus();
+    status.setMajorCode(ParseStatusCodes.SUCCESS);
     if (metaTags.getRefresh()) {
-      status.setMinorCode(ParseStatus.SUCCESS_REDIRECT);
-      status.setArgs(new String[] {metaTags.getRefreshHref().toString(),
-        Integer.toString(metaTags.getRefreshTime())});      
+      status.setMinorCode(ParseStatusCodes.SUCCESS_REDIRECT);
+      status.addToArgs(new Utf8(metaTags.getRefreshHref().toString()));
+      status.addToArgs(new Utf8(Integer.toString(metaTags.getRefreshTime())));
     }
-    
+
     Parse parse = new Parse(text, title, outlinks, status);
-    parse = htmlParseFilters.filter(url, row, parse, metaTags, root);
-    
+    parse = htmlParseFilters.filter(url, page, parse, metaTags, root);
+
     if (metaTags.getNoCache()) {             // not okay to cache
-      row.putMeta(Nutch.CACHING_FORBIDDEN_KEY, Bytes.toBytes(cachingPolicy));
+      page.putToMetadata(new Utf8(Nutch.CACHING_FORBIDDEN_KEY),
+          ByteBuffer.wrap(Bytes.toBytes(cachingPolicy)));
     }
-    
+
     return parse;
   }
 
@@ -230,7 +233,7 @@ public class HtmlParser implements Parse
       return parseTagSoup(input);
     else return parseNeko(input);
   }
-  
+
   private DocumentFragment parseTagSoup(InputSource input) throws Exception {
     HTMLDocumentImpl doc = new HTMLDocumentImpl();
     DocumentFragment frag = doc.createDocumentFragment();
@@ -243,22 +246,22 @@ public class HtmlParser implements Parse
     reader.parse(input);
     return frag;
   }
-  
+
   private DocumentFragment parseNeko(InputSource input) throws Exception {
     DOMFragmentParser parser = new DOMFragmentParser();
     try {
       parser.setFeature("http://cyberneko.org/html/features/augmentations",
-              true);
+          true);
       parser.setProperty("http://cyberneko.org/html/properties/default-encoding",
-              defaultCharEncoding);
+          defaultCharEncoding);
       parser.setFeature("http://cyberneko.org/html/features/scanner/ignore-specified-charset",
-              true);
+          true);
       parser.setFeature("http://cyberneko.org/html/features/balance-tags/ignore-outside-content",
-              false);
+          false);
       parser.setFeature("http://cyberneko.org/html/features/balance-tags/document-fragment",
-              true);
+          true);
       parser.setFeature("http://cyberneko.org/html/features/report-errors",
-              LOG.isTraceEnabled());
+          LOG.isTraceEnabled());
     } catch (SAXException e) {}
     // convert Document to DocumentFragment
     HTMLDocumentImpl doc = new HTMLDocumentImpl();
@@ -267,7 +270,7 @@ public class HtmlParser implements Parse
     DocumentFragment frag = doc.createDocumentFragment();
     parser.parse(input, frag);
     res.appendChild(frag);
-    
+
     try {
       while(true) {
         frag = doc.createDocumentFragment();
@@ -297,10 +300,11 @@ public class HtmlParser implements Parse
     return this.conf;
   }
 
-  public Collection<HbaseColumn> getColumns() {
-    return COLUMNS;
+  @Override
+  public Collection<WebPage.Field> getFields() {
+    return FIELDS;
   }
-  
+
   public static void main(String[] args) throws Exception {
     //LOG.setLevel(Level.FINE);
     String name = args[0];
@@ -312,15 +316,15 @@ public class HtmlParser implements Parse
     Configuration conf = NutchConfiguration.create();
     HtmlParser parser = new HtmlParser();
     parser.setConf(conf);
-    WebTableRow row = new WebTableRow(Bytes.toBytes(TableUtil.reverseUrl(url)));
-    row.setBaseUrl(url);
-    row.setContent(bytes);
-    row.setContentType("text/html");
-    Parse parse = parser.getParse(url, row);
+    WebPage page = new WebPage();
+    page.setBaseUrl(new Utf8(url));
+    page.setContent(ByteBuffer.wrap(bytes));
+    page.setContentType(new Utf8("text/html"));
+    Parse parse = parser.getParse(url, page);
     System.out.println("title: "+parse.getTitle());
     System.out.println("text: "+parse.getText());
     System.out.println("outlinks: " + Arrays.toString(parse.getOutlinks()));
-    
+
   }
 
 }

Added: nutch/branches/nutchbase/src/plugin/parse-js/ivy.xml
URL: http://svn.apache.org/viewvc/nutch/branches/nutchbase/src/plugin/parse-js/ivy.xml?rev=959259&view=auto
==============================================================================
--- nutch/branches/nutchbase/src/plugin/parse-js/ivy.xml (added)
+++ nutch/branches/nutchbase/src/plugin/parse-js/ivy.xml Wed Jun 30 10:36:20 2010
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../../ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!-- no name given: the artifact name defaults to the module name -->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

Modified: nutch/branches/nutchbase/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchbase/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java?rev=959259&r1=959258&r2=959259&view=diff
==============================================================================
--- nutch/branches/nutchbase/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java (original)
+++ nutch/branches/nutchbase/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java Wed Jun 30 10:36:20 2010
@@ -1,19 +1,19 @@
 /*
-* Licensed to the Apache Software Foundation (ASF) under one or more
-* contributor license agreements.  See the NOTICE file distributed with
-* this work for additional information regarding copyright ownership.
-* The ASF licenses this file to You under the Apache License, Version 2.0
-* (the "License"); you may not use this file except in compliance with
-* the License.  You may obtain a copy of the License at
-*
-*     http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 package org.apache.nutch.parse.js;
 
 import java.io.BufferedReader;
@@ -29,19 +29,18 @@ import java.util.List;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
-
+import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.parse.HTMLMetaTags;
 import org.apache.nutch.parse.HtmlParseFilter;
 import org.apache.nutch.parse.Outlink;
 import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.parse.ParseStatusCodes;
+import org.apache.nutch.parse.ParseStatusUtils;
 import org.apache.nutch.parse.Parser;
-import org.apache.nutch.plugin.Pluggable;
+import org.apache.nutch.storage.ParseStatus;
+import org.apache.nutch.storage.WebPage;
 import org.apache.nutch.util.NutchConfiguration;
-import org.apache.nutch.util.hbase.HbaseColumn;
-import org.apache.nutch.util.hbase.WebTableRow;
-import org.apache.nutch.util.hbase.WebTableRow;
-import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.TableUtil;
 import org.apache.oro.text.regex.MatchResult;
 import org.apache.oro.text.regex.Pattern;
 import org.apache.oro.text.regex.PatternCompiler;
@@ -69,10 +68,10 @@ public class JSParseFilter implements Ht
   private static final int MAX_TITLE_LEN = 80;
 
   private Configuration conf;
-  
+
   @Override
-  public Parse filter(String url, WebTableRow row, Parse parse,
-    HTMLMetaTags metaTags, DocumentFragment doc) {
+  public Parse filter(String url, WebPage page, Parse parse,
+      HTMLMetaTags metaTags, DocumentFragment doc) {
 
     ArrayList<Outlink> outlinks = new ArrayList<Outlink>();
     walk(doc, parse, metaTags, url, outlinks);
@@ -88,7 +87,7 @@ public class JSParseFilter implements Ht
     }
     return parse;
   }
-  
+
   private void walk(Node n, Parse parse, HTMLMetaTags metaTags, String base, List<Outlink> outlinks) {
     if (n instanceof Element) {
       String name = n.getNodeName();
@@ -141,14 +140,14 @@ public class JSParseFilter implements Ht
       walk(nl.item(i), parse, metaTags, base, outlinks);
     }
   }
-  
+
   @Override
-  public Parse getParse(String url, WebTableRow row) {
-    String type = row.getContentType();
+  public Parse getParse(String url, WebPage page) {
+    String type = TableUtil.toString(page.getContentType());
     if (type != null && !type.trim().equals("") && !type.toLowerCase().startsWith("application/x-javascript"))
-      return new ParseStatus(ParseStatus.FAILED_INVALID_FORMAT,
-              "Content not JavaScript: '" + type + "'").getEmptyParseHbase(getConf());
-    String script = new String(row.getContent());
+      return ParseStatusUtils.getEmptyParse(ParseStatusCodes.FAILED_INVALID_FORMAT,
+          "Content not JavaScript: '" + type + "'", getConf());
+    String script = new String(page.getContent().array());
     Outlink[] outlinks = getJSLinks(script, "", url);
     if (outlinks == null) outlinks = new Outlink[0];
     // Title? use the first line of the script...
@@ -162,16 +161,16 @@ public class JSParseFilter implements Ht
       title = script.substring(0, idx);
     }
     Parse parse =
-      new Parse(script, title, outlinks, ParseStatus.STATUS_SUCCESS);
+      new Parse(script, title, outlinks, ParseStatusUtils.STATUS_SUCCESS);
     return parse;
   }
-  
+
   private static final String STRING_PATTERN = "(\\\\*(?:\"|\'))([^\\s\"\']+?)(?:\\1)";
   // A simple pattern. This allows also invalid URL characters.
   private static final String URI_PATTERN = "(^|\\s*?)/?\\S+?[/\\.]\\S+($|\\s*)";
   // Alternative pattern, which limits valid url characters.
   //private static final String URI_PATTERN = "(^|\\s*?)[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2})+[/.](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2})+(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]*))?($|\\s*)";
-  
+
   /**
    *  This method extracts URLs from literals embedded in JavaScript.
    */
@@ -179,7 +178,7 @@ public class JSParseFilter implements Ht
 
     final List<Outlink> outlinks = new ArrayList<Outlink>();
     URL baseURL = null;
-    
+
     try {
       baseURL = new URL(base);
     } catch (Exception e) {
@@ -190,10 +189,10 @@ public class JSParseFilter implements Ht
       final PatternCompiler cp = new Perl5Compiler();
       final Pattern pattern = cp.compile(STRING_PATTERN,
           Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.READ_ONLY_MASK
-              | Perl5Compiler.MULTILINE_MASK);
+          | Perl5Compiler.MULTILINE_MASK);
       final Pattern pattern1 = cp.compile(URI_PATTERN,
-              Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.READ_ONLY_MASK
-                  | Perl5Compiler.MULTILINE_MASK);
+          Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.READ_ONLY_MASK
+          | Perl5Compiler.MULTILINE_MASK);
       final PatternMatcher matcher = new Perl5Matcher();
 
       final PatternMatcher matcher1 = new Perl5Matcher();
@@ -212,7 +211,7 @@ public class JSParseFilter implements Ht
           continue;
         }
         if (url.startsWith("www.")) {
-            url = "http://" + url;
+          url = "http://" + url;
         } else {
           // See if candidate URL is parseable.  If not, pass and move on to
           // the next match.
@@ -249,7 +248,7 @@ public class JSParseFilter implements Ht
 
     return retval;
   }
-  
+
   public static void main(String[] args) throws Exception {
     if (args.length < 2) {
       System.err.println(JSParseFilter.class.getName() + " file.js baseURL");
@@ -276,8 +275,9 @@ public class JSParseFilter implements Ht
     return this.conf;
   }
 
-  public Collection<HbaseColumn> getColumns() {
-    return EMPTY_COLUMNS;
+  @Override
+  public Collection<WebPage.Field> getFields() {
+    return null;
   }
 
 }

Added: nutch/branches/nutchbase/src/plugin/parse-rss/ivy.xml
URL: http://svn.apache.org/viewvc/nutch/branches/nutchbase/src/plugin/parse-rss/ivy.xml?rev=959259&view=auto
==============================================================================
--- nutch/branches/nutchbase/src/plugin/parse-rss/ivy.xml (added)
+++ nutch/branches/nutchbase/src/plugin/parse-rss/ivy.xml Wed Jun 30 10:36:20 2010
@@ -0,0 +1,42 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../../ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+    <dependency org="xmlrpc" name="xmlrpc" rev="1.2" conf="*->master"/>
+  </dependencies>
+  
+</ivy-module>

Modified: nutch/branches/nutchbase/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchbase/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java?rev=959259&r1=959258&r2=959259&view=diff
==============================================================================
--- nutch/branches/nutchbase/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java (original)
+++ nutch/branches/nutchbase/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java Wed Jun 30 10:36:20 2010
@@ -19,174 +19,164 @@ package org.apache.nutch.parse.rss;
 
 // JDK imports
 import java.io.ByteArrayInputStream;
-import java.net.MalformedURLException;
+import java.nio.ByteBuffer;
+import java.util.Collection;
+import java.util.HashSet;
 import java.util.List;
 import java.util.Vector;
 
-// Commons Logging imports
+import org.apache.commons.feedparser.FeedParser;
+import org.apache.commons.feedparser.FeedParserFactory;
+import org.apache.commons.feedparser.FeedParserListener;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
-
-// Hadoop imports
-import org.apache.hadoop.io.Text;
 import org.apache.hadoop.conf.Configuration;
-
-// Nutch imports
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.parse.ParseResult;
-import org.apache.nutch.parse.Parser;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseStatus;
-import org.apache.nutch.parse.ParseData;
-import org.apache.nutch.parse.ParseImpl;
 import org.apache.nutch.parse.Outlink;
-import org.apache.nutch.parse.rss.structs.RSSItem;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseStatusUtils;
+import org.apache.nutch.parse.Parser;
 import org.apache.nutch.parse.rss.structs.RSSChannel;
+import org.apache.nutch.parse.rss.structs.RSSItem;
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.protocol.Protocol;
 import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.storage.WebPage;
 import org.apache.nutch.util.LogUtil;
 import org.apache.nutch.util.NutchConfiguration;
 
-// RSS parsing imports
-import org.apache.commons.feedparser.FeedParserListener;
-import org.apache.commons.feedparser.FeedParser;
-import org.apache.commons.feedparser.FeedParserFactory;
-
-
 /**
  * 
  * @author mattmann
  * @version 1.0
  * 
- * <p>
- * RSS Parser class for nutch
- * </p>
+ *          <p>
+ *          RSS Parser class for nutch
+ *          </p>
  */
 public class RSSParser implements Parser {
-    public static final Log LOG = LogFactory.getLog("org.apache.nutch.parse.rss");
-    private Configuration conf;
+  public static final Log LOG = LogFactory.getLog("org.apache.nutch.parse.rss");
+  private Configuration conf;
 
-    /**
-     * <p>
-     * Implementation method, parses the RSS content, and then returns a
-     * {@link ParseImpl}.
-     * </p>
-     * 
-     * @param content
-     *            The content to parse (hopefully an RSS content stream)
-     * @return A {@link ParseImpl}which implements the {@link Parse}interface.
-     */
-    public ParseResult getParse(Content content) {
-
-        List theRSSChannels = null;
-
-        try {
-            byte[] raw = content.getContent();
-
-            // create a new FeedParser...
-            FeedParser parser = FeedParserFactory.newFeedParser();
-
-            // create a listener for handling our callbacks
-            FeedParserListener listener = new FeedParserListenerImpl();
-
-            // start parsing our feed and have the onItem methods called
-            parser.parse(listener, new ByteArrayInputStream(raw), /* resource */
-            null);
-
-            theRSSChannels = ((FeedParserListenerImpl) listener).getChannels();
-
-        } catch (Exception e) { // run time exception
-            if (LOG.isWarnEnabled()) {
-              e.printStackTrace(LogUtil.getWarnStream(LOG));
-              LOG.warn("nutch:parse-rss:RSSParser Exception: " + e.getMessage());
-            }
-            return new ParseStatus(ParseStatus.FAILED,
-                    "Can't be handled as rss document. " + e).getEmptyParseResult(content.getUrl(), getConf());
-        }
+  private static Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();
 
-        StringBuffer contentTitle = new StringBuffer(), indexText = new StringBuffer();
-        List theOutlinks = new Vector();
+  static {
+    FIELDS.add(WebPage.Field.BASE_URL);
+  }
 
-        // for us, the contentTitle will be a concatenation of the titles of the
-        // RSS Channels that we've parsed
-        // and the index text will be a concatenation of the RSS Channel
-        // descriptions, and descriptions of the RSS Items in the channel
-
-        // also get the outlinks
-
-        if (theRSSChannels != null) {
-            for (int i = 0; i < theRSSChannels.size(); i++) {
-                RSSChannel r = (RSSChannel) theRSSChannels.get(i);
-                contentTitle.append(r.getTitle());
-                contentTitle.append(" ");
-
-                // concat the description to the index text
-                indexText.append(r.getDescription());
-                indexText.append(" ");
-
-                if (r.getLink() != null) {
-                        // get the outlink
-			if (r.getDescription()!= null ) {
-			    theOutlinks.add(new Outlink(r.getLink(), r.getDescription()));
-			} else {
-			    theOutlinks.add(new Outlink(r.getLink(), ""));
-			}
-                }
-
-                
-
-                // now get the descriptions of all the underlying RSS Items and
-                // then index them too
-                for (int j = 0; j < r.getItems().size(); j++) {
-                    RSSItem theRSSItem = (RSSItem) r.getItems().get(j);
-                    indexText.append(theRSSItem.getDescription());
-                    indexText.append(" ");
-
-                    String whichLink = null;
-
-                    if (theRSSItem.getPermalink() != null)
-                        whichLink = theRSSItem.getPermalink();
-                    else
-                        whichLink = theRSSItem.getLink();
-
-                    if (whichLink != null) {
-			    if (theRSSItem.getDescription()!=null) {
-				theOutlinks.add(new Outlink(whichLink, theRSSItem.getDescription()));
-			    } else {
-				theOutlinks.add(new Outlink(whichLink, ""));
-			    }
+  /**
+   * <p>
+   * Implementation method, parses the RSS content, and then returns a
+   * {@link Parse}.
+   * </p>
+   * 
+   * @param url the URL of the page being parsed
+   * @param page the page whose content to parse (hopefully an RSS content stream)
+   * @return A {@link Parse}.
+   */
+  public Parse getParse(String url, WebPage page) {
+
+    List theRSSChannels = null;
+
+    try {
+      byte[] raw = page.getContent().array();
+
+      // create a new FeedParser...
+      FeedParser parser = FeedParserFactory.newFeedParser();
+
+      // create a listener for handling our callbacks
+      FeedParserListener listener = new FeedParserListenerImpl();
+
+      // start parsing our feed and have the onItem methods called
+      parser.parse(listener, new ByteArrayInputStream(raw), /* resource */
+          null);
+
+      theRSSChannels = ((FeedParserListenerImpl) listener).getChannels();
+
+    } catch (Exception e) { // run time exception
+      if (LOG.isWarnEnabled()) {
+        e.printStackTrace(LogUtil.getWarnStream(LOG));
+        LOG.warn("nutch:parse-rss:RSSParser Exception: " + e.getMessage());
+      }
+      return ParseStatusUtils.getEmptyParse(e, getConf());
+    }
 
-                    }
+    StringBuffer contentTitle = new StringBuffer(), indexText = new StringBuffer();
+    List<Outlink> theOutlinks = new Vector<Outlink>();
 
-                }
+    // for us, the contentTitle will be a concatenation of the titles of the
+    // RSS Channels that we've parsed
+    // and the index text will be a concatenation of the RSS Channel
+    // descriptions, and descriptions of the RSS Items in the channel
+
+    // also get the outlinks
+
+    if (theRSSChannels != null) {
+      for (int i = 0; i < theRSSChannels.size(); i++) {
+        RSSChannel r = (RSSChannel) theRSSChannels.get(i);
+        contentTitle.append(r.getTitle());
+        contentTitle.append(" ");
+
+        // concat the description to the index text
+        indexText.append(r.getDescription());
+        indexText.append(" ");
+
+        if (r.getLink() != null) {
+          // get the outlink
+          if (r.getDescription() != null) {
+            theOutlinks.add(new Outlink(r.getLink(), r.getDescription()));
+          } else {
+            theOutlinks.add(new Outlink(r.getLink(), ""));
+          }
+        }
 
+        // now get the descriptions of all the underlying RSS Items and
+        // then index them too
+        for (int j = 0; j < r.getItems().size(); j++) {
+          RSSItem theRSSItem = (RSSItem) r.getItems().get(j);
+          indexText.append(theRSSItem.getDescription());
+          indexText.append(" ");
+
+          String whichLink = null;
+
+          if (theRSSItem.getPermalink() != null) whichLink = theRSSItem.getPermalink();
+          else whichLink = theRSSItem.getLink();
+
+          if (whichLink != null) {
+            if (theRSSItem.getDescription() != null) {
+              theOutlinks.add(new Outlink(whichLink, theRSSItem.getDescription()));
+            } else {
+              theOutlinks.add(new Outlink(whichLink, ""));
             }
 
-            if (LOG.isTraceEnabled()) {
-              LOG.trace("nutch:parse-rss:getParse:indexText=" + indexText);
-              LOG.trace("nutch:parse-rss:getParse:contentTitle=" + contentTitle);
-            }
+          }
 
-        } else if (LOG.isTraceEnabled()) {
-            LOG.trace("nutch:parse-rss:Error:getParse: No RSS Channels recorded!");
         }
 
-        // format the outlinks
-        Outlink[] outlinks = (Outlink[]) theOutlinks.toArray(new Outlink[theOutlinks.size()]);
+      }
 
-        if (LOG.isTraceEnabled()) {
-          LOG.trace("nutch:parse-rss:getParse:found " + outlinks.length + " outlinks");
-        }
-        // if (LOG.isInfoEnabled()) {
-        //   LOG.info("Outlinks: "+outlinks);
-        // }
-
-        ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS,
-                contentTitle.toString(), outlinks, content.getMetadata());
-        return ParseResult.createParseResult(content.getUrl(), new ParseImpl(indexText.toString(), parseData));
+      if (LOG.isTraceEnabled()) {
+        LOG.trace("nutch:parse-rss:getParse:indexText=" + indexText);
+        LOG.trace("nutch:parse-rss:getParse:contentTitle=" + contentTitle);
+      }
+
+    } else if (LOG.isTraceEnabled()) {
+      LOG.trace("nutch:parse-rss:Error:getParse: No RSS Channels recorded!");
     }
 
+    // format the outlinks
+    Outlink[] outlinks = theOutlinks.toArray(new Outlink[theOutlinks.size()]);
+
+    if (LOG.isTraceEnabled()) {
+      LOG.trace("nutch:parse-rss:getParse:found " + outlinks.length + " outlinks");
+    }
+    // if (LOG.isInfoEnabled()) {
+    // LOG.info("Outlinks: "+outlinks);
+    // }
+
+    return new Parse(indexText.toString(), contentTitle.toString(), outlinks,
+        ParseStatusUtils.STATUS_SUCCESS);
+  }
+
   public void setConf(Configuration conf) {
     this.conf = conf;
   }
@@ -194,19 +184,25 @@ public class RSSParser implements Parser
   public Configuration getConf() {
     return this.conf;
   }
-  
+
+  @Override
+  public Collection<WebPage.Field> getFields() {
+    return FIELDS;
+  }
+
   public static void main(String[] args) throws Exception {
-    //LOG.setLevel(Level.FINE);
+    // LOG.setLevel(Level.FINE);
     String url = args[0];
     Configuration conf = NutchConfiguration.create();
     RSSParser parser = new RSSParser();
     parser.setConf(conf);
     Protocol protocol = new ProtocolFactory(conf).getProtocol(url);
-    Content content = protocol.getProtocolOutput(new Text(url), new CrawlDatum()).getContent();
-    Parse parse = parser.getParse(content).get(content.getUrl());
-    System.out.println("data: "+ parse.getData());
-    System.out.println("text: "+parse.getText());
+    WebPage page = new WebPage();
+    Content c = protocol.getProtocolOutput(url, page).getContent();
+    page.setContent(ByteBuffer.wrap(c.getContent()));
+    Parse parse = parser.getParse(url, page);
+    System.out.println("title: " + parse.getTitle());
+    System.out.println("text: " + parse.getText());
   }
-  
 
 }

Modified: nutch/branches/nutchbase/src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchbase/src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java?rev=959259&r1=959258&r2=959259&view=diff
==============================================================================
--- nutch/branches/nutchbase/src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java (original)
+++ nutch/branches/nutchbase/src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java Wed Jun 30 10:36:20 2010
@@ -17,6 +17,8 @@
 
 package org.apache.nutch.parse.rss;
 
+import java.nio.ByteBuffer;
+
 import org.apache.nutch.protocol.ProtocolFactory;
 import org.apache.nutch.protocol.Protocol;
 import org.apache.nutch.protocol.Content;
@@ -27,11 +29,15 @@ import org.apache.nutch.parse.ParseUtil;
 import org.apache.nutch.parse.ParseException;
 import org.apache.nutch.parse.ParseData;
 import org.apache.nutch.parse.Outlink;
+import org.apache.avro.util.Utf8;
 import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.storage.WebPage;
+import org.apache.nutch.util.MimeUtil;
 import org.apache.nutch.util.NutchConfiguration;
 
 import org.apache.hadoop.io.Text;
 import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.tika.mime.MimeType;
 
 import junit.framework.TestCase;
 
@@ -43,88 +49,89 @@ import junit.framework.TestCase;
  */
 public class TestRSSParser extends TestCase {
 
-    private String fileSeparator = System.getProperty("file.separator");
+  private String fileSeparator = System.getProperty("file.separator");
 
-    // This system property is defined in ./src/plugin/build-plugin.xml
-    private String sampleDir = System.getProperty("test.data", ".");
+  // This system property is defined in ./src/plugin/build-plugin.xml
+  private String sampleDir = System.getProperty("test.data", ".");
 
-    // Make sure sample files are copied to "test.data" as specified in
-    // ./src/plugin/parse-rss/build.xml during plugin compilation.
+  // Make sure sample files are copied to "test.data" as specified in
+  // ./src/plugin/parse-rss/build.xml during plugin compilation.
 
-    private String[] sampleFiles = { "rsstest.rss" };
-
-    /**
-     * <p>
-     * Default constructor
-     * </p>
-     * 
-     * @param name
-     *            The name of the RSSParserTest
-     */
-    public TestRSSParser(String name) {
-        super(name);
-    }
+  private String[] sampleFiles = {"rsstest.rss"};
+
+  /**
+   * <p>
+   * Default constructor
+   * </p>
+   * 
+   * @param name
+   *          The name of the RSSParserTest
+   */
+  public TestRSSParser(String name) {
+    super(name);
+  }
+
+  /**
+   * <p>
+   * The test method: tests out the following 2 asserts:
+   * </p>
+   * 
+   * <ul>
+   * <li>There are 3 outlinks read from the sample rss file</li>
+   * <li>The 3 outlinks read are in fact the correct outlinks from the sample
+   * file</li>
+   * </ul>
+   */
+  public void testIt() throws ProtocolException, ParseException {
+    String urlString;
+    Protocol protocol;
+    Parse parse;
+
+    Configuration conf = NutchConfiguration.create();
+    MimeUtil mimeutil = new MimeUtil(conf);
+    for (int i = 0; i < sampleFiles.length; i++) {
+      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
+
+      protocol = new ProtocolFactory(conf).getProtocol(urlString);
+      WebPage page = new WebPage();
+      Content c = protocol.getProtocolOutput(urlString, page).getContent();
+      page.setContent(ByteBuffer.wrap(c.getContent()));
+      MimeType mtype = mimeutil.getMimeType(urlString);
+      page.setContentType(new Utf8(mtype.getName()));
+
+      parse = new ParseUtil(conf).parse(urlString, page);
+
+      // check that there are 3 outlinks:
+      // http://test.channel.com
+      // http://www-scf.usc.edu/~mattmann/
+      // http://www.nutch.org
+
+      Outlink[] theOutlinks = parse.getOutlinks();
+
+      assertTrue("There aren't 3 outlinks read!", theOutlinks.length == 3);
+
+      // now check to make sure that those are the three expected outlinks
+      boolean hasLink1 = false, hasLink2 = false, hasLink3 = false;
+
+      for (int j = 0; j < theOutlinks.length; j++) {
+        // System.out.println("reading "+theOutlinks[j].getToUrl());
+        if (theOutlinks[j].getToUrl().equals("http://www-scf.usc.edu/~mattmann/")) {
+          hasLink1 = true;
+        }
 
-    /**
-     * <p>
-     * The test method: tests out the following 2 asserts:
-     * </p>
-     * 
-     * <ul>
-     * <li>There are 3 outlinks read from the sample rss file</li>
-     * <li>The 3 outlinks read are in fact the correct outlinks from the sample
-     * file</li>
-     * </ul>
-     */
-    public void testIt() throws ProtocolException, ParseException {
-        String urlString;
-        Protocol protocol;
-        Content content;
-        Parse parse;
-
-        Configuration conf = NutchConfiguration.create();
-        for (int i = 0; i < sampleFiles.length; i++) {
-            urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
-
-            protocol = new ProtocolFactory(conf).getProtocol(urlString);
-            content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
-            parse = new ParseUtil(conf).parseByExtensionId("parse-rss", content).get(content.getUrl());
-
-            //check that there are 3 outlinks:
-            //http://test.channel.com
-            //http://www-scf.usc.edu/~mattmann/
-            //http://www.nutch.org
-
-            ParseData theParseData = parse.getData();
-
-            Outlink[] theOutlinks = theParseData.getOutlinks();
-
-            assertTrue("There aren't 3 outlinks read!", theOutlinks.length == 3);
-
-            //now check to make sure that those are the two outlinks
-            boolean hasLink1 = false, hasLink2 = false, hasLink3 = false;
-
-            for (int j = 0; j < theOutlinks.length; j++) {
-                //System.out.println("reading "+theOutlinks[j].getToUrl());
-                if (theOutlinks[j].getToUrl().equals(
-                        "http://www-scf.usc.edu/~mattmann/")) {
-                    hasLink1 = true;
-                }
-
-                if (theOutlinks[j].getToUrl().equals("http://www.nutch.org/")) {
-                    hasLink2 = true;
-                }
-
-                if (theOutlinks[j].getToUrl()
-                        .equals("http://test.channel.com/")) {
-                    hasLink3 = true;
-                }
-            }
-
-            if (!hasLink1 || !hasLink2 || !hasLink3) {
-                fail("Outlinks read from sample rss file are not correct!");
-            }
+        if (theOutlinks[j].getToUrl().equals("http://www.nutch.org/")) {
+          hasLink2 = true;
         }
+
+        if (theOutlinks[j].getToUrl().equals("http://test.channel.com/")) {
+          hasLink3 = true;
+        }
+      }
+
+      if (!hasLink1 || !hasLink2 || !hasLink3) {
+        fail("Outlinks read from sample rss file are not correct!");
+      }
     }
+  }
 
 }

Added: nutch/branches/nutchbase/src/plugin/parse-swf/ivy.xml
URL: http://svn.apache.org/viewvc/nutch/branches/nutchbase/src/plugin/parse-swf/ivy.xml?rev=959259&view=auto
==============================================================================
--- nutch/branches/nutchbase/src/plugin/parse-swf/ivy.xml (added)
+++ nutch/branches/nutchbase/src/plugin/parse-swf/ivy.xml Wed Jun 30 10:36:20 2010
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../../ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

Added: nutch/branches/nutchbase/src/plugin/parse-tika/build-ivy.xml
URL: http://svn.apache.org/viewvc/nutch/branches/nutchbase/src/plugin/parse-tika/build-ivy.xml?rev=959259&view=auto
==============================================================================
--- nutch/branches/nutchbase/src/plugin/parse-tika/build-ivy.xml (added)
+++ nutch/branches/nutchbase/src/plugin/parse-tika/build-ivy.xml Wed Jun 30 10:36:20 2010
@@ -0,0 +1,54 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="parse-tika" default="deps-jar" xmlns:ivy="antlib:org.apache.ivy.ant">
+
+    <property name="ivy.install.version" value="2.1.0" />
+    <condition property="ivy.home" value="${env.IVY_HOME}">
+      <isset property="env.IVY_HOME" />
+    </condition>
+    <property name="ivy.home" value="${user.home}/.ant" />
+    <property name="ivy.checksums" value="" />
+    <property name="ivy.jar.dir" value="${ivy.home}/lib" />
+    <property name="ivy.jar.file" value="${ivy.jar.dir}/ivy.jar" />
+
+    <target name="download-ivy" unless="offline">
+
+        <mkdir dir="${ivy.jar.dir}"/>
+        <!-- download Ivy from web site so that it can be used even without any special installation -->
+        <get src="http://repo2.maven.org/maven2/org/apache/ivy/ivy/${ivy.install.version}/ivy-${ivy.install.version}.jar" 
+             dest="${ivy.jar.file}" usetimestamp="true"/>
+    </target>
+
+    <target name="init-ivy" depends="download-ivy">
+      <!-- try to load ivy here from ivy home, in case the user has not already dropped
+              it into ant's lib dir (note that the latter copy will always take precedence).
+              We will not fail as long as local lib dir exists (it may be empty) and
+              ivy is in at least one of ant's lib dir or the local lib dir. -->
+        <path id="ivy.lib.path">
+            <fileset dir="${ivy.jar.dir}" includes="*.jar"/>
+
+        </path>
+        <taskdef resource="org/apache/ivy/ant/antlib.xml"
+                 uri="antlib:org.apache.ivy.ant" classpathref="ivy.lib.path"/>
+    </target>
+
+  <target name="deps-jar" depends="init-ivy">
+    <ivy:retrieve pattern="lib/[artifact]-[revision].[ext]"/>
+  </target>
+
+</project>

Added: nutch/branches/nutchbase/src/plugin/parse-tika/build.xml
URL: http://svn.apache.org/viewvc/nutch/branches/nutchbase/src/plugin/parse-tika/build.xml?rev=959259&view=auto
==============================================================================
--- nutch/branches/nutchbase/src/plugin/parse-tika/build.xml (added)
+++ nutch/branches/nutchbase/src/plugin/parse-tika/build.xml Wed Jun 30 10:36:20 2010
@@ -0,0 +1,39 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="parse-tika" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+    <!-- Deploy Unit test dependencies -->
+  <target name="deps-test">
+    <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
+  </target>
+
+  <!-- for junit test -->
+  <mkdir dir="${build.test}/data"/>
+  <copy todir="${build.test}/data">
+    <fileset dir="sample">
+      <include name="*.rtf"/>
+      <include name="*.pdf"/>
+      <include name="ootest.*"/>
+      <include name="*.doc"/>
+    </fileset>
+  </copy>
+  
+
+</project>

Added: nutch/branches/nutchbase/src/plugin/parse-tika/ivy.xml
URL: http://svn.apache.org/viewvc/nutch/branches/nutchbase/src/plugin/parse-tika/ivy.xml?rev=959259&view=auto
==============================================================================
--- nutch/branches/nutchbase/src/plugin/parse-tika/ivy.xml (added)
+++ nutch/branches/nutchbase/src/plugin/parse-tika/ivy.xml Wed Jun 30 10:36:20 2010
@@ -0,0 +1,43 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../../ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+    <dependency org="org.apache.poi" name="poi-scratchpad" rev="3.6" conf="*->master"/>
+    <dependency org="org.apache.tika" name="tika-parsers" rev="0.7" conf="*->default"/>
+  </dependencies>
+  
+</ivy-module>

Added: nutch/branches/nutchbase/src/plugin/parse-tika/plugin.xml
URL: http://svn.apache.org/viewvc/nutch/branches/nutchbase/src/plugin/parse-tika/plugin.xml?rev=959259&view=auto
==============================================================================
--- nutch/branches/nutchbase/src/plugin/parse-tika/plugin.xml (added)
+++ nutch/branches/nutchbase/src/plugin/parse-tika/plugin.xml Wed Jun 30 10:36:20 2010
@@ -0,0 +1,68 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="parse-tika"
+   name="Tika Parser Plug-in"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+      <library name="parse-tika.jar">
+         <export name="*"/>
+      </library>
+
+      <library name="asm-3.1.jar"/>
+      <library name="bcmail-jdk14-136.jar"/>
+      <library name="bcmail-jdk15-1.45.jar"/>
+      <library name="bcprov-jdk14-136.jar"/>           
+      <library name="bcprov-jdk15-1.45.jar"/>
+      <library name="commons-compress-1.0.jar"/>
+      <library name="commons-logging-1.1.1.jar"/>
+      <library name="dom4j-1.6.1.jar"/>
+      <library name="fontbox-1.1.0.jar"/>
+      <library name="geronimo-stax-api_1.0_spec-1.0.1.jar"/>
+      <library name="jempbox-1.1.0.jar"/>
+      <library name="metadata-extractor-2.4.0-beta-1.jar"/>
+      <library name="pdfbox-1.1.0.jar"/>
+      <library name="poi-3.6.jar"/>
+      <library name="poi-ooxml-3.6.jar"/>
+      <library name="poi-ooxml-schemas-3.6.jar"/>
+      <library name="poi-scratchpad-3.6.jar"/>
+      <library name="tagsoup-1.2.jar"/>
+      <library name="tika-parsers-0.7.jar"/>
+      <library name="xml-apis-1.0.b2.jar"/>
+      <library name="xmlbeans-2.3.0.jar"/>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+
+   <extension point="org.apache.nutch.parse.Parser"
+              id="org.apache.nutch.parse.tika"
+              name="TikaParser">
+
+      <implementation id="org.apache.nutch.parse.tika.TikaParser"
+                      class="org.apache.nutch.parse.tika.TikaParser">
+       <parameter name="contentType" value="*"/>
+      </implementation>
+
+   </extension>
+
+</plugin>

Added: nutch/branches/nutchbase/src/plugin/parse-tika/sample/encrypted.pdf
URL: http://svn.apache.org/viewvc/nutch/branches/nutchbase/src/plugin/parse-tika/sample/encrypted.pdf?rev=959259&view=auto
==============================================================================
Binary file - no diff available.

Propchange: nutch/branches/nutchbase/src/plugin/parse-tika/sample/encrypted.pdf
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: nutch/branches/nutchbase/src/plugin/parse-tika/sample/nutch.html
URL: http://svn.apache.org/viewvc/nutch/branches/nutchbase/src/plugin/parse-tika/sample/nutch.html?rev=959259&view=auto
==============================================================================
--- nutch/branches/nutchbase/src/plugin/parse-tika/sample/nutch.html (added)
+++ nutch/branches/nutchbase/src/plugin/parse-tika/sample/nutch.html Wed Jun 30 10:36:20 2010
@@ -0,0 +1,519 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html>
+<head>
+<META http-equiv="Content-Type" content="text/html; charset=UTF-8">
+<meta content="Apache Forrest" name="Generator">
+<meta name="Forrest-version" content="0.8">
+<meta name="Forrest-skin-name" content="lucene">
+<title>Welcome to Nutch!</title>
+<link type="text/css" href="skin/basic.css" rel="stylesheet">
+<link media="screen" type="text/css" href="skin/screen.css" rel="stylesheet">
+<link media="print" type="text/css" href="skin/print.css" rel="stylesheet">
+<link type="text/css" href="skin/profile.css" rel="stylesheet">
+<script src="skin/getBlank.js" language="javascript" type="text/javascript"></script><script src="skin/getMenu.js" language="javascript" type="text/javascript"></script><script src="skin/fontsize.js" language="javascript" type="text/javascript"></script>
+<link rel="shortcut icon" href="images/favicon.ico">
+</head>
+<body onload="init()">
+<script type="text/javascript">ndeSetTextSize();</script>
+<div id="top">
+<!--+
+    |breadtrail
+    +-->
+<div class="breadtrail">
+<a href="http://www.apache.org/">Apache</a> &gt; <a href="http://lucene.apache.org/">Lucene</a> &gt; <a href="http://lucene.apache.org/nutch/">Nutch</a><script src="skin/breadcrumbs.js" language="JavaScript" type="text/javascript"></script>
+</div>
+<!--+
+    |header
+    +-->
+<div class="header">
+<!--+
+    |start group logo
+    +-->
+<div class="grouplogo">
+<a href="http://lucene.apache.org/"><img class="logoImage" alt="Lucene" src="images/lucene_green_150.gif" title="Apache Lucene"></a>
+</div>
+<!--+
+    |end group logo
+    +-->
+<!--+
+    |start Project Logo
+    +-->
+<div class="projectlogo">
+<a href="http://lucene.apache.org/nutch/"><img class="logoImage" alt="Nutch" src="images/nutch-logo.gif" title="Open Source Web Search Software"></a>
+</div>
+<!--+
+    |end Project Logo
+    +-->
+<!--+
+    |start Search
+    +-->
+<div class="searchbox">
+<form action="http://search.lucidimagination.com/p:nutch" method="get" class="roundtopsmall">
+<input onFocus="getBlank (this, 'Search the site with Solr');" size="25" name="q" id="query" type="text" value="Search the site with Solr">&nbsp; 
+                    <input name="Search" value="Search" type="submit">
+</form>
+<div style="position: relative; top: -5px; left: -10px">Powered by <a href="http://www.lucidimagination.com" style="color: #033268">Lucid Imagination</a>
+</div>
+</div>
+<!--+
+    |end search
+    +-->
+<!--+
+    |start Tabs
+    +-->
+<ul id="tabs">
+<li class="current">
+<a class="selected" href="index.html">Main</a>
+</li>
+<li>
+<a class="unselected" href="http://wiki.apache.org/nutch/">Wiki</a>
+</li>
+<li>
+<a class="unselected" href="http://issues.apache.org/jira/browse/Nutch">Jira</a>
+</li>
+</ul>
+<!--+
+    |end Tabs
+    +-->
+</div>
+</div>
+<div id="main">
+<div id="publishedStrip">
+<!--+
+    |start Subtabs
+    +-->
+<div id="level2tabs"></div>
+<!--+
+    |end Endtabs
+    +-->
+<script type="text/javascript"><!--
+document.write("Last Published: " + document.lastModified);
+//  --></script>
+</div>
+<!--+
+    |breadtrail
+    +-->
+<div class="breadtrail">
+
+             &nbsp;
+           </div>
+<!--+
+    |start Menu, mainarea
+    +-->
+<!--+
+    |start Menu
+    +-->
+<div id="menu">
+<div onclick="SwitchMenu('menu_selected_1.1', 'skin/')" id="menu_selected_1.1Title" class="menutitle" style="background-image: url('skin/images/chapter_open.gif');">Project</div>
+<div id="menu_selected_1.1" class="selectedmenuitemgroup" style="display: block;">
+<div class="menupage">
+<div class="menupagetitle">News</div>
+</div>
+<div class="menuitem">
+<a href="about.html">About</a>
+</div>
+<div class="menuitem">
+<a href="credits.html">Credits</a>
+</div>
+<div class="menuitem">
+<a href="http://www.cafepress.com/nutch/">Buy Stuff</a>
+</div>
+</div>
+<div onclick="SwitchMenu('menu_1.2', 'skin/')" id="menu_1.2Title" class="menutitle">Documentation</div>
+<div id="menu_1.2" class="menuitemgroup">
+<div class="menuitem">
+<a href="http://wiki.apache.org/nutch/FAQ">FAQ</a>
+</div>
+<div class="menuitem">
+<a href="http://wiki.apache.org/nutch/">Wiki</a>
+</div>
+<div class="menuitem">
+<a href="tutorial.html">Tutorial (0.7.2)</a>
+</div>
+<div class="menuitem">
+<a href="tutorial8.html">Tutorial (0.8.x)</a>
+</div>
+<div class="menuitem">
+<a href="bot.html">Robot     </a>
+</div>
+<div class="menuitem">
+<a href="i18n.html">i18n</a>
+</div>
+<div class="menuitem">
+<a href="apidocs-1.0/index.html">API Docs (1.0)</a>
+</div>
+<div class="menuitem">
+<a href="apidocs-0.9/index.html">API Docs (0.9)</a>
+</div>
+<div class="menuitem">
+<a href="apidocs-0.8.x/index.html">API Docs (0.8.x)</a>
+</div>
+<div class="menuitem">
+<a href="apidocs/index.html">API Docs (0.7.2)</a>
+</div>
+<div class="menuitem">
+<a href="http://lucene.zones.apache.org:8080/hudson/job/Nutch-Nightly/ws/trunk/build/docs/api/index.html">API Docs (nightly)</a>
+</div>
+</div>
+<div onclick="SwitchMenu('menu_1.3', 'skin/')" id="menu_1.3Title" class="menutitle">Resources</div>
+<div id="menu_1.3" class="menuitemgroup">
+<div class="menuitem">
+<a href="release/">Download</a>
+</div>
+<div class="menuitem">
+<a href="nightly.html">Nightly builds</a>
+</div>
+<div class="menuitem">
+<a href="mailing_lists.html">Mailing Lists</a>
+</div>
+<div class="menuitem">
+<a href="issue_tracking.html">Issue Tracking</a>
+</div>
+<div class="menuitem">
+<a href="version_control.html">Version Control</a>
+</div>
+</div>
+<div onclick="SwitchMenu('menu_1.4', 'skin/')" id="menu_1.4Title" class="menutitle">Related Projects</div>
+<div id="menu_1.4" class="menuitemgroup">
+<div class="menuitem">
+<a href="http://lucene.apache.org/java/">Lucene Java</a>
+</div>
+<div class="menuitem">
+<a href="http://lucene.apache.org/hadoop/">Hadoop</a>
+</div>
+<div class="menuitem">
+<a href="http://incubator.apache.org/solr/">Solr</a>
+</div>
+</div>
+<div id="credit">
+<hr>
+<a href="http://forrest.apache.org/"><img border="0" title="Built with Apache Forrest" alt="Built with Apache Forrest - logo" src="images/built-with-forrest-button.png" style="width: 88px;height: 31px;"></a>
+</div>
+<div id="roundbottom">
+<img style="display: none" class="corner" height="15" width="15" alt="" src="skin/images/rc-b-l-15-1body-2menu-3menu.png"></div>
+<!--+
+  |alternative credits
+  +-->
+<div id="credit2"></div>
+</div>
+<!--+
+    |end Menu
+    +-->
+<!--+
+    |start content
+    +-->
+<div id="content">
+<div title="Portable Document Format" class="pdflink">
+<a class="dida" href="index.pdf"><img alt="PDF -icon" src="skin/images/pdfdoc.gif" class="skin"><br>
+        PDF</a>
+</div>
+<h1>Welcome to Nutch!</h1>
+<div id="minitoc-area">
+<ul class="minitoc">
+<li>
+<a href="#News">News</a>
+<ul class="minitoc">
+<li>
+<a href="#14+August+2009+-+Lucene+at+US+ApacheCon">14 August 2009 - Lucene at US ApacheCon</a>
+</li>
+<li>
+<a href="#23+March+2009+-+Apache+Nutch+1.0+Released">23 March 2009 - Apache Nutch 1.0 Released</a>
+</li>
+<li>
+<a href="#09+February+2009+-+Lucene+at+ApacheCon+Europe+2009+in%0A%09%09%09Amsterdam">09 February 2009 - Lucene at ApacheCon Europe 2009 in
+			Amsterdam</a>
+</li>
+<li>
+<a href="#2+April+2007%3A+Nutch+0.9+Released">2 April 2007: Nutch 0.9 Released</a>
+</li>
+<li>
+<a href="#24+September+2006%3A+Nutch+0.8.1+Released">24 September 2006: Nutch 0.8.1 Released</a>
+</li>
+<li>
+<a href="#25+July+2006%3A+Nutch+0.8+Released">25 July 2006: Nutch 0.8 Released</a>
+</li>
+<li>
+<a href="#31+March+2006%3A+Nutch+0.7.2+Released">31 March 2006: Nutch 0.7.2 Released</a>
+</li>
+<li>
+<a href="#1+October+2005%3A+Nutch+0.7.1+Released">1 October 2005: Nutch 0.7.1 Released</a>
+</li>
+<li>
+<a href="#17+August+2005%3A+Nutch+0.7+Released">17 August 2005: Nutch 0.7 Released</a>
+</li>
+<li>
+<a href="#June+2005%3A+Nutch+graduates+from+Incubator">June 2005: Nutch graduates from Incubator</a>
+</li>
+<li>
+<a href="#January+2005%3A+Nutch+Joins+Apache+Incubator">January 2005: Nutch Joins Apache Incubator</a>
+</li>
+<li>
+<a href="#September+2004%3A+Creative+Commons+launches+Nutch-based+Search">September 2004: Creative Commons launches Nutch-based Search</a>
+</li>
+<li>
+<a href="#September+2004%3A+Oregon+State+University+switches+to+Nutch">September 2004: Oregon State University switches to Nutch</a>
+</li>
+</ul>
+</li>
+</ul>
+</div> 
+
+    
+<a name="N1000D"></a><a name="News"></a>
+<h2 class="h3">News</h2>
+<div class="section">
+<a name="N10013"></a><a name="14+August+2009+-+Lucene+at+US+ApacheCon"></a>
+<h3 class="h4">14 August 2009 - Lucene at US ApacheCon</h3>
+<p>
+        
+<a href="http://www.us.apachecon.com/c/acus2009/" title="ApacheCon US 2009">
+            <img alt="ApacheCon Logo" class="float-right" src="http://www.apache.org/events/current-event-125x125.png">
+        </a>
+        ApacheCon US is once again in the Bay Area and Lucene is coming
+        along for the ride! The Lucene community has planned two full
+        days of talks, plus a meetup and the usual bevy of training.
+        With a well-balanced mix of first time and veteran ApacheCon
+        speakers, the
+        <a href="http://www.us.apachecon.com/c/acus2009/schedule#lucene">Lucene track</a>
+        at ApacheCon US promises to have something for everyone. Be sure
+        not to miss:
+    </p>
+<p> Training:</p>
+<ul>
+        
+<li>
+            
+<a href="http://www.us.apachecon.com/c/acus2009/sessions/437">Lucene Boot Camp</a>
+            - A two day training session, Nov. 2nd &amp; 3rd
+        </li>
+        
+<li>
+            
+<a href="http://www.us.apachecon.com/c/acus2009/sessions/375">Solr Day</a>
+            - A one day training session, Nov. 2nd
+        </li>
+    
+</ul>
+<p>Thursday, Nov. 5th</p>
+<ul>
+        
+<li>
+            
+<a href="http://www.us.apachecon.com/c/acus2009/sessions/428">Introduction to the Lucene Ecosystem
+            </a>
+            - Grant Ingersoll @ 9:00
+        </li>
+        
+<li>
+            
+<a href="http://www.us.apachecon.com/c/acus2009/sessions/461">Lucene Basics and New Features</a>
+            - Michael Busch @ 10:00
+        </li>
+        
+<li>
+            
+<a href="http://www.us.apachecon.com/c/acus2009/sessions/331">Apache Solr: Out of the Box</a>
+            - Chris Hostetter @ 14:00
+        </li>
+        
+<li>
+            
+<a href="http://www.us.apachecon.com/c/acus2009/sessions/427">Introduction to Nutch</a>
+            - Andrzej Bialecki @ 15:00
+        </li>
+        
+<li>
+            
+<a href="http://www.us.apachecon.com/c/acus2009/sessions/430">Lucene and Solr Performance Tuning</a>
+            - Mark Miller @ 16:30
+        </li>
+    
+</ul>
+<p>Friday, Nov. 6th</p>
+<ul>
+        
+<li>
+            
+<a href="http://www.us.apachecon.com/c/acus2009/sessions/332">Implementing an Information Retrieval
+                Framework for an Organizational Repository</a>
+            - Sithu D Sudarsan @ 9:00
+        </li>
+        
+<li>
+            
+<a href="http://www.us.apachecon.com/c/acus2009/sessions/333">Apache Mahout - Going from raw data to
+                Information</a>
+            - Isabel Drost @ 10:00
+        </li>
+        
+<li>
+            
+<a href="http://www.us.apachecon.com/c/acus2009/sessions/334">MIME Magic with Apache Tika</a>
+            - Jukka Zitting @ 11:30
+        </li>
+        
+<li>
+            
+<a href="http://www.us.apachecon.com/c/acus2009/sessions/335">Building Intelligent Search Applications
+                with the Lucene Ecosystem</a>
+            - Ted Dunning @ 14:00
+        </li>
+        
+<li>
+            
+<a href="http://www.us.apachecon.com/c/acus2009/sessions/462">Realtime Search</a>
+            - Jason Rutherglen @ 15:00
+        </li>
+    
+</ul>
+<a name="N10091"></a><a name="23+March+2009+-+Apache+Nutch+1.0+Released"></a>
+<h3 class="h4">23 March 2009 - Apache Nutch 1.0 Released</h3>
+<p>The 1.0 release of Nutch is now available. This release includes several major feature improvements
+      such as new indexing framework, new scoring framework, Apache Solr integration just to mention a few.
+      See <a href="http://www.apache.org/dist/lucene/nutch/CHANGES-1.0.txt">
+      list of changes</a>  made in this version. The release is available
+      <a href="http://lucene.apache.org/nutch/release/">here</a>.</p>
+<a name="N100A3"></a><a name="09+February+2009+-+Lucene+at+ApacheCon+Europe+2009+in%0A%09%09%09Amsterdam"></a>
+<h3 class="h4">09 February 2009 - Lucene at ApacheCon Europe 2009 in
+			Amsterdam</h3>
+<p>
+			
+<a href="http://www.eu.apachecon.com/c/aceu2009/" title="ApacheCon EU 2009">
+				<img alt="ApacheCon EU 2009 Logo" class="float-right" src="http://www.eu.apachecon.com/page_attachments/0000/0115/125x125_basic.gif">
+			</a>
+
+			Lucene will be extremely well represented at
+			<a href="http://www.eu.apachecon.com/c/aceu2009/">ApacheCon EU 2009</a>
+			in Amsterdam, Netherlands this March 23-27, 2009:
+		</p>
+<ul>
+			
+<li>
+				
+<a href="http://eu.apachecon.com/c/aceu2009/sessions/197">Lucene Boot Camp</a>
+				- A two day training session, March 23 &amp; 24th</li>
+                
+<li>
+<a href="http://eu.apachecon.com/c/aceu2009/sessions/201">Solr Boot Camp</a> - A one day training session, March 24th</li>
+                
+<li>
+<a href="http://eu.apachecon.com/c/aceu2009/sessions/136">Introducing Apache Mahout</a> - Grant Ingersoll. March 25th @ 10:30</li>
+                
+<li>
+<a href="http://eu.apachecon.com/c/aceu2009/sessions/137">Lucene/Solr Case Studies</a> - Erik Hatcher. March 25th @ 11:30</li>
+                
+<li>
+<a href="http://eu.apachecon.com/c/aceu2009/sessions/138">Advanced Indexing Techniques with Apache Lucene</a> - Michael Busch. March 25th @ 14:00</li>  
+                   
+<li>
+<a href="http://eu.apachecon.com/c/aceu2009/sessions/251">Apache Solr - A Case Study</a> - Uri Boness. March 26th @ 17:30</li>
+           
+<li>
+<a href="http://eu.apachecon.com/c/aceu2009/sessions/250">Best of breed - httpd, forrest, solr and droids</a> - Thorsten Scherler. March 27th @ 17:30</li>
+           
+<li>
+<a href="http://eu.apachecon.com/c/aceu2009/sessions/165">Apache Droids - an intelligent standalone robot framework</a> - Thorsten Scherler. March 26th @ 15:00</li>
+
+               
+</ul>
+<a name="N100EF"></a><a name="2+April+2007%3A+Nutch+0.9+Released"></a>
+<h3 class="h4">2 April 2007: Nutch 0.9 Released</h3>
+<p>The 0.9 release of Nutch is now available. This is the second release of Nutch
+      based entirely on the underlying Hadoop platform. This release includes several critical
+      bug fixes, as well as key speedups described in more detail at 
+      <a href="http://blog.foofactory.fi/2007/03/twice-speed-half-size.html">Sami Siren's blog</a>.
+      See <a href="http://www.apache.org/dist/lucene/nutch/CHANGES-0.9.txt">
+      list of changes</a>  made in this version. The release is available
+      <a href="http://lucene.apache.org/nutch/release/">here</a>.</p>
+<a name="N10105"></a><a name="24+September+2006%3A+Nutch+0.8.1+Released"></a>
+<h3 class="h4">24 September 2006: Nutch 0.8.1 Released</h3>
+<p>The 0.8.1 release of Nutch is now available. This is a maintenance release for the 0.8 branch, fixing many serious bugs found in version 0.8.
+      See <a href="http://www.apache.org/dist/lucene/nutch/CHANGES-0.8.1.txt">
+      list of changes</a>  made in this version. The release is available
+      <a href="http://lucene.apache.org/nutch/release/">here</a>.</p>
+<a name="N10117"></a><a name="25+July+2006%3A+Nutch+0.8+Released"></a>
+<h3 class="h4">25 July 2006: Nutch 0.8 Released</h3>
+<p>The 0.8 release of Nutch is now available. This is the first release of Nutch based on
+      the Hadoop architecture. See <a href="http://svn.apache.org/viewvc/lucene/nutch/tags/release-0.8/CHANGES.txt?view=markup">
+      CHANGES.txt</a> for list of changes made in this version. The release is available
+      <a href="http://lucene.apache.org/nutch/release/">here</a>.</p>
+<a name="N10129"></a><a name="31+March+2006%3A+Nutch+0.7.2+Released"></a>
+<h3 class="h4">31 March 2006: Nutch 0.7.2 Released</h3>
+<p>The 0.7.2 release of Nutch is now available. This is a bug fix release for 0.7 branch. See
+      <a href="http://svn.apache.org/viewcvs.cgi/lucene/nutch/branches/branch-0.7/CHANGES.txt?rev=390158">
+      CHANGES.txt</a> for details. The release is available
+      <a href="http://lucene.apache.org/nutch/release/">here</a>.</p>
+<a name="N1013B"></a><a name="1+October+2005%3A+Nutch+0.7.1+Released"></a>
+<h3 class="h4">1 October 2005: Nutch 0.7.1 Released</h3>
+<p>The 0.7.1 release of Nutch is now available. This is a bug fix release. See
+      <a href="http://svn.apache.org/viewcvs.cgi/lucene/nutch/branches/branch-0.7/CHANGES.txt?rev=292986">
+      CHANGES.txt</a> for details. The release is available
+      <a href="http://lucene.apache.org/nutch/release/">here</a>.</p>
+<a name="N1014D"></a><a name="17+August+2005%3A+Nutch+0.7+Released"></a>
+<h3 class="h4">17 August 2005: Nutch 0.7 Released</h3>
+<p>This is the first Nutch release as an Apache Lucene sub-project. See 
+      <a href="http://svn.apache.org/viewcvs.cgi/lucene/nutch/trunk/CHANGES.txt?rev=233150">
+      CHANGES.txt</a> for details. The release is available 
+      <a href="http://lucene.apache.org/nutch/release/">here</a>.</p>
+<a name="N1015F"></a><a name="June+2005%3A+Nutch+graduates+from+Incubator"></a>
+<h3 class="h4">June 2005: Nutch graduates from Incubator</h3>
+<p>Nutch has now graduated from the Apache incubator, and is now
+      a Subproject of Lucene.</p>
+<a name="N10169"></a><a name="January+2005%3A+Nutch+Joins+Apache+Incubator"></a>
+<h3 class="h4">January 2005: Nutch Joins Apache Incubator</h3>
+<p>Nutch is a two-year-old open source project, previously
+        hosted at Sourceforge and backed by its own non-profit
+        organization. The non-profit was founded in order to assign
+        copyright, so that we could retain the right to change the
+        license. We have now determined that the Apache license is the
+        appropriate license for Nutch and no longer require the
+        overhead of an independent non-profit organization. Nutch's
+        board of directors and its developers were both polled and
+        supported the move to the Apache foundation.</p>
+<a name="N10173"></a><a name="September+2004%3A+Creative+Commons+launches+Nutch-based+Search"></a>
+<h3 class="h4">September 2004: Creative Commons launches Nutch-based Search</h3>
+<p>Creative Commons unveiled a beta version of its search
+      engine, which scours the web for text, images, audio, and video
+      free to re-use on certain terms &mdash; a search refinement offered by
+      no other company or organization.</p>
+<p>See the <a href="http://creativecommons.org/press-releases/entry/5064">Creative
+      Commons Press Release</a> for more details.</p>
+<a name="N10184"></a><a name="September+2004%3A+Oregon+State+University+switches+to+Nutch"></a>
+<h3 class="h4">September 2004: Oregon State University switches to Nutch</h3>
+<p>Oregon State University is converting its searching
+      infrastructure from Google&trade; to the open source project
+      Nutch. The effort to replace Google&trade; will realize
+      significant cost savings for Oregon State University, while
+      promoting both the Nutch Search Engine and transparency in
+      search engine use and management.</p>
+<p>For more details see the announcement by OSU's <a href="http://osuosl.org/news_folder/nutch">Open Source
+      Lab</a>.</p>
+</div>
+
+  
+</div>
+<!--+
+    |end content
+    +-->
+<div class="clearboth">&nbsp;</div>
+</div>
+<div id="footer">
+<!--+
+    |start bottomstrip
+    +-->
+<div class="lastmodified">
+<script type="text/javascript"><!--
+document.write("Last Published: " + document.lastModified);
+//  --></script>
+</div>
+<div class="copyright">
+        Copyright &copy;
+         2006 <a href="http://www.apache.org/licenses/">The Apache Software Foundation.</a>
+</div>
+<div id="logos"></div>
+<!--+
+    |end bottomstrip
+    +-->
+</div>
+</body>
+</html>

Added: nutch/branches/nutchbase/src/plugin/parse-tika/sample/ootest.odt
URL: http://svn.apache.org/viewvc/nutch/branches/nutchbase/src/plugin/parse-tika/sample/ootest.odt?rev=959259&view=auto
==============================================================================
Binary file - no diff available.

Propchange: nutch/branches/nutchbase/src/plugin/parse-tika/sample/ootest.odt
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: nutch/branches/nutchbase/src/plugin/parse-tika/sample/ootest.sxw
URL: http://svn.apache.org/viewvc/nutch/branches/nutchbase/src/plugin/parse-tika/sample/ootest.sxw?rev=959259&view=auto
==============================================================================
Binary file - no diff available.

Propchange: nutch/branches/nutchbase/src/plugin/parse-tika/sample/ootest.sxw
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: nutch/branches/nutchbase/src/plugin/parse-tika/sample/ootest.txt
URL: http://svn.apache.org/viewvc/nutch/branches/nutchbase/src/plugin/parse-tika/sample/ootest.txt?rev=959259&view=auto
==============================================================================
--- nutch/branches/nutchbase/src/plugin/parse-tika/sample/ootest.txt (added)
+++ nutch/branches/nutchbase/src/plugin/parse-tika/sample/ootest.txt Wed Jun 30 10:36:20 2010
@@ -0,0 +1,30 @@
+ï»¿Abcedfg				?????
+Abcdefg
+Abcdefg
+abcdefg
+
+
+
+
+
+
+
+
+
+
+ http://www.openoffice.org
+
+Title
+Col1
+Col2
+Col3
+head
+Cell1
+Cell2
+Cel3
+total
+TOTAL
+TOTAL
+TOTAL
+
+Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Integer a leo in lacus malesuada ornare. Mauris sagittis. Nam vestibulum. Nunc gravida vestibulum augue. Praesent sed lectus quis lectus adipiscing bibendum. Sed nulla. Duis posuere justo eget urna. Proin lorem orci, vestibulum ut, consequat molestie, eleifend a, nibh. Mauris sed lacus. Etiam blandit tincidunt neque. Cras ac sapien. Duis erat.