You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by gs...@apache.org on 2010/12/01 02:48:29 UTC
svn commit: r1040852 - in /lucene/dev/branches/branch_3x: ./ lucene/ solr/
solr/contrib/extraction/ solr/contrib/extraction/lib/
solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/
solr/contrib/extraction/src/test/java/org/apache/...
Author: gsingers
Date: Wed Dec 1 01:48:28 2010
New Revision: 1040852
URL: http://svn.apache.org/viewvc?rev=1040852&view=rev
Log:
SOLR-2241: upgrade to Tika 0.8
Added:
lucene/dev/branches/branch_3x/solr/contrib/extraction/lib/boilerpipe-1.1.0.jar
- copied unchanged from r1040815, lucene/dev/trunk/solr/contrib/extraction/lib/boilerpipe-1.1.0.jar
lucene/dev/branches/branch_3x/solr/contrib/extraction/lib/commons-compress-1.1.jar
- copied unchanged from r1040815, lucene/dev/trunk/solr/contrib/extraction/lib/commons-compress-1.1.jar
lucene/dev/branches/branch_3x/solr/contrib/extraction/lib/fontbox-1.3.1.jar
- copied unchanged from r1040815, lucene/dev/trunk/solr/contrib/extraction/lib/fontbox-1.3.1.jar
lucene/dev/branches/branch_3x/solr/contrib/extraction/lib/jempbox-1.3.1.jar
- copied unchanged from r1040815, lucene/dev/trunk/solr/contrib/extraction/lib/jempbox-1.3.1.jar
lucene/dev/branches/branch_3x/solr/contrib/extraction/lib/netcdf-4.2.jar
- copied unchanged from r1040815, lucene/dev/trunk/solr/contrib/extraction/lib/netcdf-4.2.jar
lucene/dev/branches/branch_3x/solr/contrib/extraction/lib/pdfbox-1.3.1.jar
- copied unchanged from r1040815, lucene/dev/trunk/solr/contrib/extraction/lib/pdfbox-1.3.1.jar
lucene/dev/branches/branch_3x/solr/contrib/extraction/lib/poi-3.7.jar
- copied unchanged from r1040815, lucene/dev/trunk/solr/contrib/extraction/lib/poi-3.7.jar
lucene/dev/branches/branch_3x/solr/contrib/extraction/lib/poi-ooxml-3.7.jar
- copied unchanged from r1040815, lucene/dev/trunk/solr/contrib/extraction/lib/poi-ooxml-3.7.jar
lucene/dev/branches/branch_3x/solr/contrib/extraction/lib/poi-ooxml-schemas-3.7.jar
- copied unchanged from r1040815, lucene/dev/trunk/solr/contrib/extraction/lib/poi-ooxml-schemas-3.7.jar
lucene/dev/branches/branch_3x/solr/contrib/extraction/lib/poi-scratchpad-3.7.jar
- copied unchanged from r1040815, lucene/dev/trunk/solr/contrib/extraction/lib/poi-scratchpad-3.7.jar
lucene/dev/branches/branch_3x/solr/contrib/extraction/lib/rome-0.9.jar
- copied unchanged from r1040815, lucene/dev/trunk/solr/contrib/extraction/lib/rome-0.9.jar
lucene/dev/branches/branch_3x/solr/contrib/extraction/lib/tika-core-0.8.jar
- copied unchanged from r1040815, lucene/dev/trunk/solr/contrib/extraction/lib/tika-core-0.8.jar
lucene/dev/branches/branch_3x/solr/contrib/extraction/lib/tika-parsers-0.8.jar
- copied unchanged from r1040815, lucene/dev/trunk/solr/contrib/extraction/lib/tika-parsers-0.8.jar
Removed:
lucene/dev/branches/branch_3x/solr/contrib/extraction/lib/commons-compress-1.0.jar
lucene/dev/branches/branch_3x/solr/contrib/extraction/lib/fontbox-1.1.0.jar
lucene/dev/branches/branch_3x/solr/contrib/extraction/lib/jempbox-1.1.0.jar
lucene/dev/branches/branch_3x/solr/contrib/extraction/lib/pdfbox-1.1.0.jar
lucene/dev/branches/branch_3x/solr/contrib/extraction/lib/poi-3.6.jar
lucene/dev/branches/branch_3x/solr/contrib/extraction/lib/poi-ooxml-3.6.jar
lucene/dev/branches/branch_3x/solr/contrib/extraction/lib/poi-ooxml-schemas-3.6.jar
lucene/dev/branches/branch_3x/solr/contrib/extraction/lib/poi-scratchpad-3.6.jar
lucene/dev/branches/branch_3x/solr/contrib/extraction/lib/tika-core-0.8-SNAPSHOT.jar
lucene/dev/branches/branch_3x/solr/contrib/extraction/lib/tika-parsers-0.8-SNAPSHOT.jar
Modified:
lucene/dev/branches/branch_3x/ (props changed)
lucene/dev/branches/branch_3x/lucene/ (props changed)
lucene/dev/branches/branch_3x/solr/ (props changed)
lucene/dev/branches/branch_3x/solr/contrib/extraction/CHANGES.txt
lucene/dev/branches/branch_3x/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java
lucene/dev/branches/branch_3x/solr/contrib/extraction/src/test/java/org/apache/solr/handler/ExtractingRequestHandlerTest.java
Modified: lucene/dev/branches/branch_3x/solr/contrib/extraction/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/extraction/CHANGES.txt?rev=1040852&r1=1040851&r2=1040852&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/contrib/extraction/CHANGES.txt (original)
+++ lucene/dev/branches/branch_3x/solr/contrib/extraction/CHANGES.txt Wed Dec 1 01:48:28 2010
@@ -17,21 +17,23 @@ You will need Solr up and running. Then
to your Solr Home lib directory. See http://wiki.apache.org/solr/ExtractingRequestHandler for more details on hooking it in
and configuring.
-$Id:$
+ Tika Dependency
+ ---------------
+
+Current Version: Tika 0.8 (released 11/07/2010)
-================== Release 1.5-dev ==================
+$Id:$
+================== Release 3.1-dev ==================
-* SOLR-1567: Upgrade to Tika 0.5, which upgrades many of the underlying libraries (PDFBox, for example) too (gsingers)
+* SOLR-2241: Upgraded to Tika 0.8 and changed deprecated parse call (gsingers)
* SOLR-1756: The date.format setting causes ClassCastException when enabled and the config code that
parses this setting does not properly use the same iterator instance. (Christoph Brill, Mark Miller)
-* SOLR-1738: Upgrade to Tika 0.6 (gsingers)
-
* SOLR-1813: Add ICU4j to libs and add tests for Arabic extraction (Robert Muir via gsingers)
-* SOLR-1819: Upgraded to Tika 0.7 (gsingers)
+* SOLR-1902: Upgraded to Tika 0.8-SNAPSHOT to incorporate passing in Solr's custom ClassLoader (gsingers)
================== Release 1.4.0 ==================
Modified: lucene/dev/branches/branch_3x/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java?rev=1040852&r1=1040851&r2=1040852&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java (original)
+++ lucene/dev/branches/branch_3x/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java Wed Dec 1 01:48:28 2010
@@ -31,6 +31,7 @@ import org.apache.solr.handler.ContentSt
import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.sax.xpath.Matcher;
@@ -191,7 +192,8 @@ public class ExtractingDocumentLoader ex
} //else leave it as is
//potentially use a wrapper handler for parsing, but we still need the SolrContentHandler for getting the document.
- parser.parse(inputStream, parsingHandler, metadata);
+ ParseContext context = new ParseContext();//TODO: should we design a way to pass in parse context?
+ parser.parse(inputStream, parsingHandler, metadata, context);
if (extractOnly == false) {
addDoc(handler);
} else {
Modified: lucene/dev/branches/branch_3x/solr/contrib/extraction/src/test/java/org/apache/solr/handler/ExtractingRequestHandlerTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/extraction/src/test/java/org/apache/solr/handler/ExtractingRequestHandlerTest.java?rev=1040852&r1=1040851&r2=1040852&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/contrib/extraction/src/test/java/org/apache/solr/handler/ExtractingRequestHandlerTest.java (original)
+++ lucene/dev/branches/branch_3x/solr/contrib/extraction/src/test/java/org/apache/solr/handler/ExtractingRequestHandlerTest.java Wed Dec 1 01:48:28 2010
@@ -58,13 +58,15 @@ public class ExtractingRequestHandlerTes
@Test
public void testExtraction() throws Exception {
- // broken for turkish: https://issues.apache.org/jira/browse/SOLR-2088
- String defLang = Locale.getDefault().getLanguage();
- assumeFalse("Known bugs under Turkish locale: https://issues.apache.org/jira/browse/SOLR-2088", defLang.equals("tr") || defLang.equals("az"));
ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
assertTrue("handler is null and it shouldn't be", handler != null);
- loadLocal("solr-word.pdf", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer",
+ loadLocal("solr-word.pdf",
+ "fmap.created", "extractedDate",
+ "fmap.producer", "extractedProducer",
"fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
+ "fmap.Creation-Date", "extractedDate",
+ "fmap.AAPL:Keywords", "ignored_a",
+ "fmap.xmpTPg:NPages", "ignored_a",
"fmap.Author", "extractedAuthor",
"fmap.content", "extractedContent",
"literal.id", "one",
@@ -146,6 +148,7 @@ public class ExtractingRequestHandlerTes
}
+
@Test
public void testDefaultField() throws Exception {
ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
@@ -349,6 +352,9 @@ public class ExtractingRequestHandlerTes
loadLocal("arabic.pdf", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer",
"fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
+ "fmap.Creation-Date", "extractedDate",
+ "fmap.AAPL:Keywords", "ignored_a",
+ "fmap.xmpTPg:NPages", "ignored_a",
"fmap.Author", "extractedAuthor",
"fmap.content", "wdf_nocase",
"literal.id", "one",