You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by si...@apache.org on 2009/02/18 13:43:05 UTC
svn commit: r745499 - in /lucene/nutch/trunk: ./ src/plugin/lib-jakarta-poi/
src/plugin/lib-jakarta-poi/lib/ src/plugin/parse-msword/
src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/
src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/chp/
src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/
Author: siren
Date: Wed Feb 18 12:43:04 2009
New Revision: 745499
URL: http://svn.apache.org/viewvc?rev=745499&view=rev
Log:
NUTCH-691 - Update jakarta poi jars to the most relevant version, contributed by Dmitry Lihachev
Added:
lucene/nutch/trunk/src/plugin/lib-jakarta-poi/lib/poi-3.5-beta4-20081128.jar (with props)
lucene/nutch/trunk/src/plugin/lib-jakarta-poi/lib/poi-scratchpad-3.5-beta4-20081128.jar (with props)
Removed:
lucene/nutch/trunk/src/plugin/lib-jakarta-poi/lib/poi-3.0-alpha1-20050704.jar
lucene/nutch/trunk/src/plugin/lib-jakarta-poi/lib/poi-scratchpad-3.0-alpha1-20050704.jar
Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/src/plugin/lib-jakarta-poi/plugin.xml
lucene/nutch/trunk/src/plugin/parse-msword/build.xml
lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/Word6Extractor.java
lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/WordExtractor.java
lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/chp/Word6CHPBinTable.java
lucene/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java
Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=745499&r1=745498&r2=745499&view=diff
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Wed Feb 18 12:43:04 2009
@@ -343,6 +343,9 @@
128. NUTCH-631 - MoreIndexingFilter fails with NoSuchElementException
(Stefan Will, siren)
+129. NUTCH-691 - Update jakarta poi jars to the most relevant version
+ (Dmitry Lihachev via siren)
+
Release 0.9 - 2007-04-02
1. Changed log4j confiquration to log to stdout on commandline
Added: lucene/nutch/trunk/src/plugin/lib-jakarta-poi/lib/poi-3.5-beta4-20081128.jar
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-jakarta-poi/lib/poi-3.5-beta4-20081128.jar?rev=745499&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/nutch/trunk/src/plugin/lib-jakarta-poi/lib/poi-3.5-beta4-20081128.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: lucene/nutch/trunk/src/plugin/lib-jakarta-poi/lib/poi-scratchpad-3.5-beta4-20081128.jar
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-jakarta-poi/lib/poi-scratchpad-3.5-beta4-20081128.jar?rev=745499&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/nutch/trunk/src/plugin/lib-jakarta-poi/lib/poi-scratchpad-3.5-beta4-20081128.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Modified: lucene/nutch/trunk/src/plugin/lib-jakarta-poi/plugin.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-jakarta-poi/plugin.xml?rev=745499&r1=745498&r2=745499&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/lib-jakarta-poi/plugin.xml (original)
+++ lucene/nutch/trunk/src/plugin/lib-jakarta-poi/plugin.xml Wed Feb 18 12:43:04 2009
@@ -29,10 +29,10 @@
provider-name="jakarta.apache.org">
<runtime>
- <library name="poi-3.0-alpha1-20050704.jar">
+ <library name="poi-3.5-beta4-20081128.jar">
<export name="*"/>
</library>
- <library name="poi-scratchpad-3.0-alpha1-20050704.jar">
+ <library name="poi-scratchpad-3.5-beta4-20081128.jar">
<export name="*"/>
</library>
</runtime>
Modified: lucene/nutch/trunk/src/plugin/parse-msword/build.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-msword/build.xml?rev=745499&r1=745498&r2=745499&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-msword/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/parse-msword/build.xml Wed Feb 18 12:43:04 2009
@@ -44,7 +44,8 @@
<!-- for junit test -->
<mkdir dir="${build.test}/data"/>
- <copy file="sample/word95.doc" todir="${build.test}/data"/>
- <copy file="sample/word97.doc" todir="${build.test}/data"/>
+ <copy todir="${build.test}/data">
+ <fileset dir="sample" includes="*.doc" />
+ </copy>
</project>
Modified: lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/Word6Extractor.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/Word6Extractor.java?rev=745499&r1=745498&r2=745499&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/Word6Extractor.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/Word6Extractor.java Wed Feb 18 12:43:04 2009
@@ -53,8 +53,9 @@
int chpTableSize = LittleEndian.getInt(mainStream, 0xbc);
// get a list of character properties
+
Word6CHPBinTable chpTable = new Word6CHPBinTable(mainStream, chpTableOffset,
- chpTableSize, fcMin);
+ chpTableSize, fcMin, new TextPieceTable());
List textRuns = chpTable.getTextRuns();
// iterate through the
Modified: lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/WordExtractor.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/WordExtractor.java?rev=745499&r1=745498&r2=745499&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/WordExtractor.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/WordExtractor.java Wed Feb 18 12:43:04 2009
@@ -119,11 +119,12 @@
int chpOffset = LittleEndian.getInt(header, 0xfa);
int chpSize = LittleEndian.getInt(header, 0xfe);
int fcMin = LittleEndian.getInt(header, 0x18);
- CHPBinTable cbt = new CHPBinTable(header, tableStream, chpOffset, chpSize, fcMin);
// load our text pieces and our character runs
ComplexFileTable cft = new ComplexFileTable(header, tableStream, complexOffset, fcMin);
TextPieceTable tpt = cft.getTextPieceTable();
+ CHPBinTable cbt = new CHPBinTable(header, tableStream, chpOffset, chpSize, fcMin, tpt);
+
List textPieces = tpt.getTextPieces();
// make the POIFS objects available for garbage collection
Modified: lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/chp/Word6CHPBinTable.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/chp/Word6CHPBinTable.java?rev=745499&r1=745498&r2=745499&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/chp/Word6CHPBinTable.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/chp/Word6CHPBinTable.java Wed Feb 18 12:43:04 2009
@@ -45,7 +45,7 @@
* @param fcMin The start of text in the main stream.
*/
public Word6CHPBinTable(byte[] documentStream, int offset,
- int size, int fcMin)
+ int size, int fcMin, TextPieceTable tpt)
{
PlexOfCps binTable = new PlexOfCps(documentStream, offset, size, 2);
@@ -58,7 +58,7 @@
int pageOffset = POIFSConstants.BIG_BLOCK_SIZE * pageNum;
CHPFormattedDiskPage cfkp = new CHPFormattedDiskPage(documentStream,
- pageOffset, fcMin);
+ pageOffset, fcMin, tpt);
int fkpSize = cfkp.size();
Modified: lucene/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java?rev=745499&r1=745498&r2=745499&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java Wed Feb 18 12:43:04 2009
@@ -32,6 +32,9 @@
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
+import java.io.File;
+import java.io.FilenameFilter;
+
import junit.framework.TestCase;
/**
@@ -50,31 +53,38 @@
private String[] sampleFiles = {"word95.doc","word97.doc"};
private String expectedText = "This is a sample doc file prepared for nutch.";
+
+ private Configuration conf;
public TestMSWordParser(String name) {
super(name);
}
- protected void setUp() {}
+ protected void setUp() {
+ conf = NutchConfiguration.create();
+ conf.set("file.content.limit", "-1");
+ }
protected void tearDown() {}
+ public String getTextContent(String fileName) throws ProtocolException, ParseException {
+ String urlString = "file:" + sampleDir + fileSeparator + fileName;
+ Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
+ Content content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
+ Parse parse = new ParseUtil(conf).parseByExtensionId("parse-msword", content).get(content.getUrl());
+ return parse.getText();
+ }
+
public void testIt() throws ProtocolException, ParseException {
- String urlString;
- Protocol protocol;
- Content content;
- Parse parse;
-
- Configuration conf = NutchConfiguration.create();
for (int i=0; i<sampleFiles.length; i++) {
- urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
-
- protocol = new ProtocolFactory(conf).getProtocol(urlString);
- content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
- parse = new ParseUtil(conf).parseByExtensionId("parse-msword", content).get(content.getUrl());
-
- assertTrue(parse.getText().startsWith(expectedText));
+ assertTrue(getTextContent(sampleFiles[i]).startsWith(expectedText));
}
}
+ public void testOpeningDocs() throws ProtocolException, ParseException {
+ String[] filenames = new File(sampleDir).list();
+ for (int i = 0; i < filenames.length; i++) {
+ assertTrue("cann't read content of " + filenames[i], getTextContent(filenames[i]).length() > 0);
+ }
+ }
}