You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by gs...@apache.org on 2010/12/01 02:48:29 UTC

svn commit: r1040852 - in /lucene/dev/branches/branch_3x: ./ lucene/ solr/ solr/contrib/extraction/ solr/contrib/extraction/lib/ solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ solr/contrib/extraction/src/test/java/org/apache/...

Author: gsingers
Date: Wed Dec  1 01:48:28 2010
New Revision: 1040852

URL: http://svn.apache.org/viewvc?rev=1040852&view=rev
Log:
SOLR-2241: upgrade to Tika 0.8

Added:
    lucene/dev/branches/branch_3x/solr/contrib/extraction/lib/boilerpipe-1.1.0.jar
      - copied unchanged from r1040815, lucene/dev/trunk/solr/contrib/extraction/lib/boilerpipe-1.1.0.jar
    lucene/dev/branches/branch_3x/solr/contrib/extraction/lib/commons-compress-1.1.jar
      - copied unchanged from r1040815, lucene/dev/trunk/solr/contrib/extraction/lib/commons-compress-1.1.jar
    lucene/dev/branches/branch_3x/solr/contrib/extraction/lib/fontbox-1.3.1.jar
      - copied unchanged from r1040815, lucene/dev/trunk/solr/contrib/extraction/lib/fontbox-1.3.1.jar
    lucene/dev/branches/branch_3x/solr/contrib/extraction/lib/jempbox-1.3.1.jar
      - copied unchanged from r1040815, lucene/dev/trunk/solr/contrib/extraction/lib/jempbox-1.3.1.jar
    lucene/dev/branches/branch_3x/solr/contrib/extraction/lib/netcdf-4.2.jar
      - copied unchanged from r1040815, lucene/dev/trunk/solr/contrib/extraction/lib/netcdf-4.2.jar
    lucene/dev/branches/branch_3x/solr/contrib/extraction/lib/pdfbox-1.3.1.jar
      - copied unchanged from r1040815, lucene/dev/trunk/solr/contrib/extraction/lib/pdfbox-1.3.1.jar
    lucene/dev/branches/branch_3x/solr/contrib/extraction/lib/poi-3.7.jar
      - copied unchanged from r1040815, lucene/dev/trunk/solr/contrib/extraction/lib/poi-3.7.jar
    lucene/dev/branches/branch_3x/solr/contrib/extraction/lib/poi-ooxml-3.7.jar
      - copied unchanged from r1040815, lucene/dev/trunk/solr/contrib/extraction/lib/poi-ooxml-3.7.jar
    lucene/dev/branches/branch_3x/solr/contrib/extraction/lib/poi-ooxml-schemas-3.7.jar
      - copied unchanged from r1040815, lucene/dev/trunk/solr/contrib/extraction/lib/poi-ooxml-schemas-3.7.jar
    lucene/dev/branches/branch_3x/solr/contrib/extraction/lib/poi-scratchpad-3.7.jar
      - copied unchanged from r1040815, lucene/dev/trunk/solr/contrib/extraction/lib/poi-scratchpad-3.7.jar
    lucene/dev/branches/branch_3x/solr/contrib/extraction/lib/rome-0.9.jar
      - copied unchanged from r1040815, lucene/dev/trunk/solr/contrib/extraction/lib/rome-0.9.jar
    lucene/dev/branches/branch_3x/solr/contrib/extraction/lib/tika-core-0.8.jar
      - copied unchanged from r1040815, lucene/dev/trunk/solr/contrib/extraction/lib/tika-core-0.8.jar
    lucene/dev/branches/branch_3x/solr/contrib/extraction/lib/tika-parsers-0.8.jar
      - copied unchanged from r1040815, lucene/dev/trunk/solr/contrib/extraction/lib/tika-parsers-0.8.jar
Removed:
    lucene/dev/branches/branch_3x/solr/contrib/extraction/lib/commons-compress-1.0.jar
    lucene/dev/branches/branch_3x/solr/contrib/extraction/lib/fontbox-1.1.0.jar
    lucene/dev/branches/branch_3x/solr/contrib/extraction/lib/jempbox-1.1.0.jar
    lucene/dev/branches/branch_3x/solr/contrib/extraction/lib/pdfbox-1.1.0.jar
    lucene/dev/branches/branch_3x/solr/contrib/extraction/lib/poi-3.6.jar
    lucene/dev/branches/branch_3x/solr/contrib/extraction/lib/poi-ooxml-3.6.jar
    lucene/dev/branches/branch_3x/solr/contrib/extraction/lib/poi-ooxml-schemas-3.6.jar
    lucene/dev/branches/branch_3x/solr/contrib/extraction/lib/poi-scratchpad-3.6.jar
    lucene/dev/branches/branch_3x/solr/contrib/extraction/lib/tika-core-0.8-SNAPSHOT.jar
    lucene/dev/branches/branch_3x/solr/contrib/extraction/lib/tika-parsers-0.8-SNAPSHOT.jar
Modified:
    lucene/dev/branches/branch_3x/   (props changed)
    lucene/dev/branches/branch_3x/lucene/   (props changed)
    lucene/dev/branches/branch_3x/solr/   (props changed)
    lucene/dev/branches/branch_3x/solr/contrib/extraction/CHANGES.txt
    lucene/dev/branches/branch_3x/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java
    lucene/dev/branches/branch_3x/solr/contrib/extraction/src/test/java/org/apache/solr/handler/ExtractingRequestHandlerTest.java

Modified: lucene/dev/branches/branch_3x/solr/contrib/extraction/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/extraction/CHANGES.txt?rev=1040852&r1=1040851&r2=1040852&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/contrib/extraction/CHANGES.txt (original)
+++ lucene/dev/branches/branch_3x/solr/contrib/extraction/CHANGES.txt Wed Dec  1 01:48:28 2010
@@ -17,21 +17,23 @@ You will need Solr up and running.  Then
 to your Solr Home lib directory.  See http://wiki.apache.org/solr/ExtractingRequestHandler for more details on hooking it in
  and configuring.
 
-$Id:$
+ Tika Dependency
+ ---------------
+
+Current Version: Tika 0.8 (released 11/07/2010)
 
-================== Release 1.5-dev ==================
+$Id:$
 
+================== Release 3.1-dev ==================
 
-* SOLR-1567: Upgrade to Tika 0.5, which upgrades many of the underlying libraries (PDFBox, for example) too (gsingers)
+* Upgraded to Tika 0.8 and changed deprecated parse call
 
 * SOLR-1756: The date.format setting causes ClassCastException when enabled and the config code that
   parses this setting does not properly use the same iterator instance. (Christoph Brill, Mark Miller)
 
-* SOLR-1738: Upgrade to Tika 0.6 (gsingers)
-
 * SOLR-18913: Add ICU4j to libs and add tests for Arabic extraction (Robert Muir via gsingers)
 
-* SOLR-1819: Upgraded to Tika 0.7 (gsingers)
+* SOLR-1902: Upgraded to Tika 0.8-SNAPSHOT to incorporate passing in Solr's custom ClassLoader (gsingers)
 
 ================== Release 1.4.0 ==================
 

Modified: lucene/dev/branches/branch_3x/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java?rev=1040852&r1=1040851&r2=1040852&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java (original)
+++ lucene/dev/branches/branch_3x/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java Wed Dec  1 01:48:28 2010
@@ -31,6 +31,7 @@ import org.apache.solr.handler.ContentSt
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.apache.tika.sax.xpath.Matcher;
@@ -191,7 +192,8 @@ public class ExtractingDocumentLoader ex
         } //else leave it as is
 
         //potentially use a wrapper handler for parsing, but we still need the SolrContentHandler for getting the document.
-        parser.parse(inputStream, parsingHandler, metadata);
+        ParseContext context = new ParseContext();//TODO: should we design a way to pass in parse context?
+        parser.parse(inputStream, parsingHandler, metadata, context);
         if (extractOnly == false) {
           addDoc(handler);
         } else {

Modified: lucene/dev/branches/branch_3x/solr/contrib/extraction/src/test/java/org/apache/solr/handler/ExtractingRequestHandlerTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/extraction/src/test/java/org/apache/solr/handler/ExtractingRequestHandlerTest.java?rev=1040852&r1=1040851&r2=1040852&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/contrib/extraction/src/test/java/org/apache/solr/handler/ExtractingRequestHandlerTest.java (original)
+++ lucene/dev/branches/branch_3x/solr/contrib/extraction/src/test/java/org/apache/solr/handler/ExtractingRequestHandlerTest.java Wed Dec  1 01:48:28 2010
@@ -58,13 +58,15 @@ public class ExtractingRequestHandlerTes
 
   @Test
   public void testExtraction() throws Exception {
-    // broken for turkish: https://issues.apache.org/jira/browse/SOLR-2088
-    String defLang = Locale.getDefault().getLanguage();
-    assumeFalse("Known bugs under Turkish locale: https://issues.apache.org/jira/browse/SOLR-2088", defLang.equals("tr") || defLang.equals("az"));
     ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
     assertTrue("handler is null and it shouldn't be", handler != null);
-    loadLocal("solr-word.pdf", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer",
+    loadLocal("solr-word.pdf",
+            "fmap.created", "extractedDate",
+            "fmap.producer", "extractedProducer",
             "fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
+            "fmap.Creation-Date", "extractedDate",
+            "fmap.AAPL:Keywords", "ignored_a",
+            "fmap.xmpTPg:NPages", "ignored_a",
             "fmap.Author", "extractedAuthor",
             "fmap.content", "extractedContent",
            "literal.id", "one",
@@ -146,6 +148,7 @@ public class ExtractingRequestHandlerTes
 
   }
 
+
   @Test
   public void testDefaultField() throws Exception {
     ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
@@ -349,6 +352,9 @@ public class ExtractingRequestHandlerTes
 
     loadLocal("arabic.pdf", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer",
         "fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
+        "fmap.Creation-Date", "extractedDate",
+        "fmap.AAPL:Keywords", "ignored_a",
+        "fmap.xmpTPg:NPages", "ignored_a",
         "fmap.Author", "extractedAuthor",
         "fmap.content", "wdf_nocase",
        "literal.id", "one",