You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by je...@apache.org on 2006/02/18 00:22:59 UTC

svn commit: r378653 - in /lucene/nutch/trunk/src/plugin/parse-rtf/src: java/org/apache/nutch/parse/rtf/RTFParseFactory.java java/org/apache/nutch/parse/rtf/RTFParserDelegateImpl.java test/org/apache/nutch/parse/rtf/TestRTFParser.java

Author: jerome
Date: Fri Feb 17 15:22:55 2006
New Revision: 378653

URL: http://svn.apache.org/viewcvs?rev=378653&view=rev
Log:
Adapt parse-rtf to Nutch API changes (metadata, parse, protocol, ...)

Modified:
    lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java
    lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParserDelegateImpl.java
    lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java

Modified: lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java?rev=378653&r1=378652&r2=378653&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java Fri Feb 17 15:22:55 2006
@@ -13,38 +13,42 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.parse.rtf;
 
-import org.apache.nutch.parse.*;
-import org.apache.nutch.protocol.Content;
-<<<<<<< .mine
-<<<<<<< .mine
-import org.apache.nutch.util.MetadataNames;
-
-=======
-import org.apache.nutch.util.NutchConf;
-=======
-import org.apache.hadoop.conf.Configuration;
->>>>>>> .r374853
->>>>>>> .r373941
+// JDK imports
 import java.io.ByteArrayInputStream;
 import java.io.InputStreamReader;
 import java.io.Reader;
-import java.util.Properties;
 
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+
+// Nutch imports
+import org.apache.nutch.metadata.DublinCore;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.OutlinkExtractor;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.parse.Parser;
+import org.apache.nutch.protocol.Content;
+
+// RTF Parser imports
+import com.etranslate.tm.processing.rtf.ParseException;
 import com.etranslate.tm.processing.rtf.RTFParser;
 
+
 /**
  * A parser for RTF documents
  * 
  * @author Andy Hedges
  */
-public class RTFParseFactory implements Parser, MetadataNames {
+public class RTFParseFactory implements Parser {
 
   private Configuration conf;
 
-  public Parse getParse(Content content) throws ParseException {
+  public Parse getParse(Content content) {
     byte[] raw = content.getContent();
     Reader reader = new InputStreamReader(new ByteArrayInputStream(raw));
     RTFParserDelegateImpl delegate = new RTFParserDelegateImpl();
@@ -55,28 +59,31 @@
 
     try {
       rtfParser.parse();
-    } catch (com.etranslate.tm.processing.rtf.ParseException e) {
-      throw new ParseException("Exception parsing RTF document", e);
+    } catch (ParseException e) {
+        return new ParseStatus(ParseStatus.FAILED,
+                               ParseStatus.FAILED_EXCEPTION,
+                               e.toString()).getEmptyParse(conf);
     }
 
-    Properties metadata = new Properties();
-    metadata.putAll(content.getMetadata());
-    metadata.putAll(delegate.getMetaData());
-    String title = metadata.getProperty(TITLE);
+    Metadata metadata = new Metadata();
+    metadata.setAll(delegate.getMetaData());
+    String title = metadata.get(DublinCore.TITLE);
 
     if (title != null) {
-        //(CM): Why remove the title metadata property here? Even 
-        //though it's stored in the ParseData, it still might be useful
-        //to have via this properties object?
-        //metadata.remove(title);
+      metadata.remove(DublinCore.TITLE);
     } else {
       title = "";
     }
 
     String text = delegate.getText();
 
-    return new ParseImpl(text, new ParseData(title, OutlinkExtractor
-        .getOutlinks(text, this.conf), metadata));
+    return new ParseImpl(text,
+                         new ParseData(ParseStatus.STATUS_SUCCESS,
+                                       title,
+                                       OutlinkExtractor
+        .                              getOutlinks(text, this.conf),
+                                       content.getMetadata(),
+                                       metadata));
   }
 
   public void setConf(Configuration conf) {

Modified: lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParserDelegateImpl.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParserDelegateImpl.java?rev=378653&r1=378652&r2=378653&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParserDelegateImpl.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParserDelegateImpl.java Fri Feb 17 15:22:55 2006
@@ -16,27 +16,48 @@
 
 package org.apache.nutch.parse.rtf;
 
+// RTF Parser imports
 import com.etranslate.tm.processing.rtf.RTFParserDelegate;
 
+// JDK imports
 import java.util.Arrays;
 import java.util.List;
 import java.util.Properties;
 
-import org.apache.nutch.util.MetadataNames;
+// Nutch imports
+import org.apache.nutch.metadata.DublinCore;
+import org.apache.nutch.metadata.Office;
+
 
 /**
  * A parser delegate for handling rtf events.
  * @author Andy Hedges
  */
-public class RTFParserDelegateImpl implements RTFParserDelegate, MetadataNames {
+public class RTFParserDelegateImpl implements RTFParserDelegate {
 
   String tabs = "";
   Properties metadata = new Properties();
 
-  String[] META_NAMES_TEXT = {TITLE, SUBJECT, AUTHOR, "manager",
-                              "company", "operator", "category", KEYWORDS,
-                              COMMENTS, "doccomm", "hlinkbase"};
-  String[] META_NAMES_DATE = {"creatim", "creatim", "printim", "buptim"};
+  String[] META_NAMES_TEXT = {
+    DublinCore.TITLE,
+    DublinCore.SUBJECT,
+    Office.AUTHOR,
+    "manager",
+    "company",
+    "operator",
+    "category",
+    Office.KEYWORDS,
+    Office.COMMENTS,
+    "doccomm",
+    "hlinkbase"
+  };
+  
+  String[] META_NAMES_DATE = {
+    "creatim",
+    "creatim",
+    "printim",
+    "buptim"
+  };
 
   String metaName = "";
   List metaNamesText = Arrays.asList(META_NAMES_TEXT);

Modified: lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java?rev=378653&r1=378652&r2=378653&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java Fri Feb 17 15:22:55 2006
@@ -16,33 +16,33 @@
 
 package org.apache.nutch.parse.rtf;
 
+// JUnit imports
 import junit.framework.TestCase;
+
+// Nutch imports
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.DublinCore;
+import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.ParseUtil;
 import org.apache.nutch.parse.ParseException;
-import org.apache.nutch.parse.ParserFactory;
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.protocol.Protocol;
 import org.apache.nutch.protocol.ProtocolException;
 import org.apache.nutch.protocol.ProtocolFactory;
-<<<<<<< .mine
-<<<<<<< .mine
-import org.apache.nutch.util.MetadataNames;
-=======
-import org.apache.nutch.util.NutchConf;
-=======
+import org.apache.nutch.util.NutchConfiguration;
+
+// Hadoop imports
 import org.apache.hadoop.conf.Configuration;
->>>>>>> .r374853
->>>>>>> .r373941
+import org.apache.hadoop.io.UTF8;
 
-import java.util.Properties;
 
 /**
  * Unit tests for TestRTFParser.  (Adapted from John Xing msword unit tests).
  *
  * @author Andy Hedges
  */
-public class TestRTFParser extends TestCase implements MetadataNames {
+public class TestRTFParser extends TestCase {
 
   private String fileSeparator = System.getProperty("file.separator");
   // This system property is defined in ./src/plugin/build-plugin.xml
@@ -72,16 +72,16 @@
     Configuration conf = NutchConfiguration.create();
     urlString = "file:" + sampleDir + fileSeparator + rtfFile;
     protocol = new ProtocolFactory(conf).getProtocol(urlString);
-    content = protocol.getContent(urlString);
-
+    content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum())
+                      .getContent();
     parse = new ParseUtil(conf).parseByParserId("parse-rtf", content);
     String text = parse.getText();
     assertEquals("The quick brown fox jumps over the lazy dog", text.trim());
 
     String title = parse.getData().getTitle();
-    Properties meta = parse.getData().getMetadata();
+    Metadata meta = parse.getData().getParseMeta();
     assertEquals("test rft document", title);
-    assertEquals("tests", meta.getProperty(SUBJECT));
+    assertEquals("tests", meta.get(DublinCore.SUBJECT));