You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by si...@apache.org on 2009/02/18 13:43:05 UTC

svn commit: r745499 - in /lucene/nutch/trunk: ./ src/plugin/lib-jakarta-poi/ src/plugin/lib-jakarta-poi/lib/ src/plugin/parse-msword/ src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/ src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/chp/ src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/

Author: siren
Date: Wed Feb 18 12:43:04 2009
New Revision: 745499

URL: http://svn.apache.org/viewvc?rev=745499&view=rev
Log:
NUTCH-691 - Update jakarta poi jars to the most relevant version, contributed by Dmitry Lihachev

Added:
    lucene/nutch/trunk/src/plugin/lib-jakarta-poi/lib/poi-3.5-beta4-20081128.jar   (with props)
    lucene/nutch/trunk/src/plugin/lib-jakarta-poi/lib/poi-scratchpad-3.5-beta4-20081128.jar   (with props)
Removed:
    lucene/nutch/trunk/src/plugin/lib-jakarta-poi/lib/poi-3.0-alpha1-20050704.jar
    lucene/nutch/trunk/src/plugin/lib-jakarta-poi/lib/poi-scratchpad-3.0-alpha1-20050704.jar
Modified:
    lucene/nutch/trunk/CHANGES.txt
    lucene/nutch/trunk/src/plugin/lib-jakarta-poi/plugin.xml
    lucene/nutch/trunk/src/plugin/parse-msword/build.xml
    lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/Word6Extractor.java
    lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/WordExtractor.java
    lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/chp/Word6CHPBinTable.java
    lucene/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=745499&r1=745498&r2=745499&view=diff
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Wed Feb 18 12:43:04 2009
@@ -343,6 +343,9 @@
 128. NUTCH-631 - MoreIndexingFilter fails with NoSuchElementException
      (Stefan Will, siren)
      
+129. NUTCH-691 - Update jakarta poi jars to the most relevant version
+     (Dmitry Lihachev via siren)
+
 Release 0.9 - 2007-04-02
 
  1. Changed log4j confiquration to log to stdout on commandline

Added: lucene/nutch/trunk/src/plugin/lib-jakarta-poi/lib/poi-3.5-beta4-20081128.jar
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-jakarta-poi/lib/poi-3.5-beta4-20081128.jar?rev=745499&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/nutch/trunk/src/plugin/lib-jakarta-poi/lib/poi-3.5-beta4-20081128.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/nutch/trunk/src/plugin/lib-jakarta-poi/lib/poi-scratchpad-3.5-beta4-20081128.jar
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-jakarta-poi/lib/poi-scratchpad-3.5-beta4-20081128.jar?rev=745499&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/nutch/trunk/src/plugin/lib-jakarta-poi/lib/poi-scratchpad-3.5-beta4-20081128.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Modified: lucene/nutch/trunk/src/plugin/lib-jakarta-poi/plugin.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-jakarta-poi/plugin.xml?rev=745499&r1=745498&r2=745499&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/lib-jakarta-poi/plugin.xml (original)
+++ lucene/nutch/trunk/src/plugin/lib-jakarta-poi/plugin.xml Wed Feb 18 12:43:04 2009
@@ -29,10 +29,10 @@
    provider-name="jakarta.apache.org">
 
    <runtime>
-     <library name="poi-3.0-alpha1-20050704.jar">
+     <library name="poi-3.5-beta4-20081128.jar">
         <export name="*"/>
      </library>
-     <library name="poi-scratchpad-3.0-alpha1-20050704.jar">
+     <library name="poi-scratchpad-3.5-beta4-20081128.jar">
         <export name="*"/>
      </library>
    </runtime>

Modified: lucene/nutch/trunk/src/plugin/parse-msword/build.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-msword/build.xml?rev=745499&r1=745498&r2=745499&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-msword/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/parse-msword/build.xml Wed Feb 18 12:43:04 2009
@@ -44,7 +44,8 @@
 
   <!-- for junit test -->
   <mkdir dir="${build.test}/data"/>
-  <copy file="sample/word95.doc" todir="${build.test}/data"/>
-  <copy file="sample/word97.doc" todir="${build.test}/data"/>
+  <copy todir="${build.test}/data">
+    <fileset dir="sample" includes="*.doc" />
+  </copy>
 
 </project>

Modified: lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/Word6Extractor.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/Word6Extractor.java?rev=745499&r1=745498&r2=745499&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/Word6Extractor.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/Word6Extractor.java Wed Feb 18 12:43:04 2009
@@ -53,8 +53,9 @@
     int chpTableSize = LittleEndian.getInt(mainStream, 0xbc);
 
     // get a list of character properties
+    
     Word6CHPBinTable chpTable = new Word6CHPBinTable(mainStream, chpTableOffset,
-      chpTableSize, fcMin);
+      chpTableSize, fcMin, new TextPieceTable());
     List textRuns = chpTable.getTextRuns();
 
     // iterate through the

Modified: lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/WordExtractor.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/WordExtractor.java?rev=745499&r1=745498&r2=745499&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/WordExtractor.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/WordExtractor.java Wed Feb 18 12:43:04 2009
@@ -119,11 +119,12 @@
     int chpOffset = LittleEndian.getInt(header, 0xfa);
     int chpSize = LittleEndian.getInt(header, 0xfe);
     int fcMin = LittleEndian.getInt(header, 0x18);
-    CHPBinTable cbt = new CHPBinTable(header, tableStream, chpOffset, chpSize, fcMin);
 
     // load our text pieces and our character runs
     ComplexFileTable cft = new ComplexFileTable(header, tableStream, complexOffset, fcMin);
     TextPieceTable tpt = cft.getTextPieceTable();
+    CHPBinTable cbt = new CHPBinTable(header, tableStream, chpOffset, chpSize, fcMin, tpt);
+
     List textPieces = tpt.getTextPieces();
 
     // make the POIFS objects available for garbage collection

Modified: lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/chp/Word6CHPBinTable.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/chp/Word6CHPBinTable.java?rev=745499&r1=745498&r2=745499&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/chp/Word6CHPBinTable.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/chp/Word6CHPBinTable.java Wed Feb 18 12:43:04 2009
@@ -45,7 +45,7 @@
    * @param fcMin The start of text in the main stream.
    */
   public Word6CHPBinTable(byte[] documentStream, int offset,
-                     int size, int fcMin)
+                     int size, int fcMin, TextPieceTable tpt)
   {
     PlexOfCps binTable = new PlexOfCps(documentStream, offset, size, 2);
 
@@ -58,7 +58,7 @@
       int pageOffset = POIFSConstants.BIG_BLOCK_SIZE * pageNum;
 
       CHPFormattedDiskPage cfkp = new CHPFormattedDiskPage(documentStream,
-        pageOffset, fcMin);
+        pageOffset, fcMin, tpt);
 
       int fkpSize = cfkp.size();
 

Modified: lucene/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java?rev=745499&r1=745498&r2=745499&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java Wed Feb 18 12:43:04 2009
@@ -32,6 +32,9 @@
 import org.apache.hadoop.io.Text;
 import org.apache.nutch.crawl.CrawlDatum;
 
+import java.io.File;
+import java.io.FilenameFilter;
+
 import junit.framework.TestCase;
 
 /** 
@@ -50,31 +53,38 @@
   private String[] sampleFiles = {"word95.doc","word97.doc"};
 
   private String expectedText = "This is a sample doc file prepared for nutch.";
+  
+  private Configuration conf;
 
   public TestMSWordParser(String name) { 
     super(name); 
   }
 
-  protected void setUp() {}
+  protected void setUp() {
+    conf = NutchConfiguration.create();
+    conf.set("file.content.limit", "-1");
+  }
 
   protected void tearDown() {}
 
+  public String getTextContent(String fileName) throws ProtocolException, ParseException {
+    String urlString = "file:" + sampleDir + fileSeparator + fileName;
+    Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
+    Content content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
+    Parse parse = new ParseUtil(conf).parseByExtensionId("parse-msword", content).get(content.getUrl());
+    return parse.getText();
+  }
+  
   public void testIt() throws ProtocolException, ParseException {
-    String urlString;
-    Protocol protocol;
-    Content content;
-    Parse parse;
-
-    Configuration conf = NutchConfiguration.create();
     for (int i=0; i<sampleFiles.length; i++) {
-      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
-
-      protocol = new ProtocolFactory(conf).getProtocol(urlString);
-      content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
-      parse = new ParseUtil(conf).parseByExtensionId("parse-msword", content).get(content.getUrl());
-
-      assertTrue(parse.getText().startsWith(expectedText));
+      assertTrue(getTextContent(sampleFiles[i]).startsWith(expectedText));
     }
   }
 
+  public void testOpeningDocs() throws ProtocolException, ParseException {
+    String[] filenames = new File(sampleDir).list();
+      for (int i = 0; i < filenames.length; i++) {
+        assertTrue("cann't read content of " + filenames[i], getTextContent(filenames[i]).length() > 0);
+      }      
+  }
 }