You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by je...@apache.org on 2006/02/18 00:22:59 UTC
svn commit: r378653 - in /lucene/nutch/trunk/src/plugin/parse-rtf/src:
java/org/apache/nutch/parse/rtf/RTFParseFactory.java
java/org/apache/nutch/parse/rtf/RTFParserDelegateImpl.java
test/org/apache/nutch/parse/rtf/TestRTFParser.java
Author: jerome
Date: Fri Feb 17 15:22:55 2006
New Revision: 378653
URL: http://svn.apache.org/viewcvs?rev=378653&view=rev
Log:
Adapt parse-rtf to Nutch API changes (metadata, parse, protocol, ...)
Modified:
lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java
lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParserDelegateImpl.java
lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java
Modified: lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java?rev=378653&r1=378652&r2=378653&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java Fri Feb 17 15:22:55 2006
@@ -13,38 +13,42 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-
package org.apache.nutch.parse.rtf;
-import org.apache.nutch.parse.*;
-import org.apache.nutch.protocol.Content;
-<<<<<<< .mine
-<<<<<<< .mine
-import org.apache.nutch.util.MetadataNames;
-
-=======
-import org.apache.nutch.util.NutchConf;
-=======
-import org.apache.hadoop.conf.Configuration;
->>>>>>> .r374853
->>>>>>> .r373941
+// JDK imports
import java.io.ByteArrayInputStream;
import java.io.InputStreamReader;
import java.io.Reader;
-import java.util.Properties;
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+
+// Nutch imports
+import org.apache.nutch.metadata.DublinCore;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.OutlinkExtractor;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.parse.Parser;
+import org.apache.nutch.protocol.Content;
+
+// RTF Parser imports
+import com.etranslate.tm.processing.rtf.ParseException;
import com.etranslate.tm.processing.rtf.RTFParser;
+
/**
* A parser for RTF documents
*
* @author Andy Hedges
*/
-public class RTFParseFactory implements Parser, MetadataNames {
+public class RTFParseFactory implements Parser {
private Configuration conf;
- public Parse getParse(Content content) throws ParseException {
+ public Parse getParse(Content content) {
byte[] raw = content.getContent();
Reader reader = new InputStreamReader(new ByteArrayInputStream(raw));
RTFParserDelegateImpl delegate = new RTFParserDelegateImpl();
@@ -55,28 +59,31 @@
try {
rtfParser.parse();
- } catch (com.etranslate.tm.processing.rtf.ParseException e) {
- throw new ParseException("Exception parsing RTF document", e);
+ } catch (ParseException e) {
+ return new ParseStatus(ParseStatus.FAILED,
+ ParseStatus.FAILED_EXCEPTION,
+ e.toString()).getEmptyParse(conf);
}
- Properties metadata = new Properties();
- metadata.putAll(content.getMetadata());
- metadata.putAll(delegate.getMetaData());
- String title = metadata.getProperty(TITLE);
+ Metadata metadata = new Metadata();
+ metadata.setAll(delegate.getMetaData());
+ String title = metadata.get(DublinCore.TITLE);
if (title != null) {
- //(CM): Why remove the title metadata property here? Even
- //though it's stored in the ParseData, it still might be useful
- //to have via this properties object?
- //metadata.remove(title);
+ metadata.remove(DublinCore.TITLE);
} else {
title = "";
}
String text = delegate.getText();
- return new ParseImpl(text, new ParseData(title, OutlinkExtractor
- .getOutlinks(text, this.conf), metadata));
+ return new ParseImpl(text,
+ new ParseData(ParseStatus.STATUS_SUCCESS,
+ title,
+ OutlinkExtractor
+ . getOutlinks(text, this.conf),
+ content.getMetadata(),
+ metadata));
}
public void setConf(Configuration conf) {
Modified: lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParserDelegateImpl.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParserDelegateImpl.java?rev=378653&r1=378652&r2=378653&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParserDelegateImpl.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParserDelegateImpl.java Fri Feb 17 15:22:55 2006
@@ -16,27 +16,48 @@
package org.apache.nutch.parse.rtf;
+// RTF Parser imports
import com.etranslate.tm.processing.rtf.RTFParserDelegate;
+// JDK imports
import java.util.Arrays;
import java.util.List;
import java.util.Properties;
-import org.apache.nutch.util.MetadataNames;
+// Nutch imports
+import org.apache.nutch.metadata.DublinCore;
+import org.apache.nutch.metadata.Office;
+
/**
* A parser delegate for handling rtf events.
* @author Andy Hedges
*/
-public class RTFParserDelegateImpl implements RTFParserDelegate, MetadataNames {
+public class RTFParserDelegateImpl implements RTFParserDelegate {
String tabs = "";
Properties metadata = new Properties();
- String[] META_NAMES_TEXT = {TITLE, SUBJECT, AUTHOR, "manager",
- "company", "operator", "category", KEYWORDS,
- COMMENTS, "doccomm", "hlinkbase"};
- String[] META_NAMES_DATE = {"creatim", "creatim", "printim", "buptim"};
+ String[] META_NAMES_TEXT = {
+ DublinCore.TITLE,
+ DublinCore.SUBJECT,
+ Office.AUTHOR,
+ "manager",
+ "company",
+ "operator",
+ "category",
+ Office.KEYWORDS,
+ Office.COMMENTS,
+ "doccomm",
+ "hlinkbase"
+ };
+
+ String[] META_NAMES_DATE = {
+ "creatim",
+ "creatim",
+ "printim",
+ "buptim"
+ };
String metaName = "";
List metaNamesText = Arrays.asList(META_NAMES_TEXT);
Modified: lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java?rev=378653&r1=378652&r2=378653&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java Fri Feb 17 15:22:55 2006
@@ -16,33 +16,33 @@
package org.apache.nutch.parse.rtf;
+// JUnit imports
import junit.framework.TestCase;
+
+// Nutch imports
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.DublinCore;
+import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.parse.ParseException;
-import org.apache.nutch.parse.ParserFactory;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ProtocolException;
import org.apache.nutch.protocol.ProtocolFactory;
-<<<<<<< .mine
-<<<<<<< .mine
-import org.apache.nutch.util.MetadataNames;
-=======
-import org.apache.nutch.util.NutchConf;
-=======
+import org.apache.nutch.util.NutchConfiguration;
+
+// Hadoop imports
import org.apache.hadoop.conf.Configuration;
->>>>>>> .r374853
->>>>>>> .r373941
+import org.apache.hadoop.io.UTF8;
-import java.util.Properties;
/**
* Unit tests for TestRTFParser. (Adapted from John Xing msword unit tests).
*
* @author Andy Hedges
*/
-public class TestRTFParser extends TestCase implements MetadataNames {
+public class TestRTFParser extends TestCase {
private String fileSeparator = System.getProperty("file.separator");
// This system property is defined in ./src/plugin/build-plugin.xml
@@ -72,16 +72,16 @@
Configuration conf = NutchConfiguration.create();
urlString = "file:" + sampleDir + fileSeparator + rtfFile;
protocol = new ProtocolFactory(conf).getProtocol(urlString);
- content = protocol.getContent(urlString);
-
+ content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum())
+ .getContent();
parse = new ParseUtil(conf).parseByParserId("parse-rtf", content);
String text = parse.getText();
assertEquals("The quick brown fox jumps over the lazy dog", text.trim());
String title = parse.getData().getTitle();
- Properties meta = parse.getData().getMetadata();
+ Metadata meta = parse.getData().getParseMeta();
assertEquals("test rft document", title);
- assertEquals("tests", meta.getProperty(SUBJECT));
+ assertEquals("tests", meta.get(DublinCore.SUBJECT));