You are viewing a plain-text version of this content; the canonical link was not preserved in this copy.
Posted to commits@lucene.apache.org by gs...@apache.org on 2010/11/30 23:33:33 UTC
svn commit: r1040815 - in /lucene/dev/trunk/solr/contrib/extraction: ./ lib/
src/main/java/org/apache/solr/handler/extraction/
src/test/java/org/apache/solr/handler/
Author: gsingers
Date: Tue Nov 30 22:33:30 2010
New Revision: 1040815
URL: http://svn.apache.org/viewvc?rev=1040815&view=rev
Log:
SOLR-2241: upgrade to Tika 0.8
Added:
lucene/dev/trunk/solr/contrib/extraction/lib/boilerpipe-1.1.0.jar (with props)
lucene/dev/trunk/solr/contrib/extraction/lib/commons-compress-1.1.jar (with props)
lucene/dev/trunk/solr/contrib/extraction/lib/fontbox-1.3.1.jar (with props)
lucene/dev/trunk/solr/contrib/extraction/lib/jempbox-1.3.1.jar (with props)
lucene/dev/trunk/solr/contrib/extraction/lib/netcdf-4.2.jar (with props)
lucene/dev/trunk/solr/contrib/extraction/lib/pdfbox-1.3.1.jar (with props)
lucene/dev/trunk/solr/contrib/extraction/lib/poi-3.7.jar (with props)
lucene/dev/trunk/solr/contrib/extraction/lib/poi-ooxml-3.7.jar (with props)
lucene/dev/trunk/solr/contrib/extraction/lib/poi-ooxml-schemas-3.7.jar (with props)
lucene/dev/trunk/solr/contrib/extraction/lib/poi-scratchpad-3.7.jar (with props)
lucene/dev/trunk/solr/contrib/extraction/lib/rome-0.9.jar (with props)
lucene/dev/trunk/solr/contrib/extraction/lib/tika-core-0.8.jar (with props)
lucene/dev/trunk/solr/contrib/extraction/lib/tika-parsers-0.8.jar (with props)
Removed:
lucene/dev/trunk/solr/contrib/extraction/lib/commons-compress-1.0.jar
lucene/dev/trunk/solr/contrib/extraction/lib/fontbox-1.1.0.jar
lucene/dev/trunk/solr/contrib/extraction/lib/jempbox-1.1.0.jar
lucene/dev/trunk/solr/contrib/extraction/lib/pdfbox-1.1.0.jar
lucene/dev/trunk/solr/contrib/extraction/lib/poi-3.6.jar
lucene/dev/trunk/solr/contrib/extraction/lib/poi-ooxml-3.6.jar
lucene/dev/trunk/solr/contrib/extraction/lib/poi-ooxml-schemas-3.6.jar
lucene/dev/trunk/solr/contrib/extraction/lib/poi-scratchpad-3.6.jar
lucene/dev/trunk/solr/contrib/extraction/lib/tika-core-0.8-SNAPSHOT.jar
lucene/dev/trunk/solr/contrib/extraction/lib/tika-parsers-0.8-SNAPSHOT.jar
Modified:
lucene/dev/trunk/solr/contrib/extraction/CHANGES.txt
lucene/dev/trunk/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java
lucene/dev/trunk/solr/contrib/extraction/src/test/java/org/apache/solr/handler/ExtractingRequestHandlerTest.java
Modified: lucene/dev/trunk/solr/contrib/extraction/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/extraction/CHANGES.txt?rev=1040815&r1=1040814&r2=1040815&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/extraction/CHANGES.txt (original)
+++ lucene/dev/trunk/solr/contrib/extraction/CHANGES.txt Tue Nov 30 22:33:30 2010
@@ -20,13 +20,13 @@ to your Solr Home lib directory. See ht
Tika Dependency
---------------
-Current Version: Tika 0.8-SNAPSHOT (rev 942725)
+Current Version: Tika 0.8 (released 11/07/2010)
$Id:$
-================== Release 1.5-dev ==================
-
+================== Release 3.1-dev ==================
+* Upgraded to Tika 0.8 and changed deprecated parse call
* SOLR-1756: The date.format setting causes ClassCastException when enabled and the config code that
parses this setting does not properly use the same iterator instance. (Christoph Brill, Mark Miller)
Added: lucene/dev/trunk/solr/contrib/extraction/lib/boilerpipe-1.1.0.jar
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/extraction/lib/boilerpipe-1.1.0.jar?rev=1040815&view=auto
==============================================================================
Binary file - no diff available.
Added: lucene/dev/trunk/solr/contrib/extraction/lib/commons-compress-1.1.jar
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/extraction/lib/commons-compress-1.1.jar?rev=1040815&view=auto
==============================================================================
Binary file - no diff available.
Added: lucene/dev/trunk/solr/contrib/extraction/lib/fontbox-1.3.1.jar
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/extraction/lib/fontbox-1.3.1.jar?rev=1040815&view=auto
==============================================================================
Binary file - no diff available.
Added: lucene/dev/trunk/solr/contrib/extraction/lib/jempbox-1.3.1.jar
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/extraction/lib/jempbox-1.3.1.jar?rev=1040815&view=auto
==============================================================================
Binary file - no diff available.
Added: lucene/dev/trunk/solr/contrib/extraction/lib/netcdf-4.2.jar
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/extraction/lib/netcdf-4.2.jar?rev=1040815&view=auto
==============================================================================
Binary file - no diff available.
Added: lucene/dev/trunk/solr/contrib/extraction/lib/pdfbox-1.3.1.jar
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/extraction/lib/pdfbox-1.3.1.jar?rev=1040815&view=auto
==============================================================================
Binary file - no diff available.
Added: lucene/dev/trunk/solr/contrib/extraction/lib/poi-3.7.jar
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/extraction/lib/poi-3.7.jar?rev=1040815&view=auto
==============================================================================
Binary file - no diff available.
Added: lucene/dev/trunk/solr/contrib/extraction/lib/poi-ooxml-3.7.jar
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/extraction/lib/poi-ooxml-3.7.jar?rev=1040815&view=auto
==============================================================================
Binary file - no diff available.
Added: lucene/dev/trunk/solr/contrib/extraction/lib/poi-ooxml-schemas-3.7.jar
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/extraction/lib/poi-ooxml-schemas-3.7.jar?rev=1040815&view=auto
==============================================================================
Binary file - no diff available.
Added: lucene/dev/trunk/solr/contrib/extraction/lib/poi-scratchpad-3.7.jar
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/extraction/lib/poi-scratchpad-3.7.jar?rev=1040815&view=auto
==============================================================================
Binary file - no diff available.
Added: lucene/dev/trunk/solr/contrib/extraction/lib/rome-0.9.jar
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/extraction/lib/rome-0.9.jar?rev=1040815&view=auto
==============================================================================
Binary file - no diff available.
Added: lucene/dev/trunk/solr/contrib/extraction/lib/tika-core-0.8.jar
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/extraction/lib/tika-core-0.8.jar?rev=1040815&view=auto
==============================================================================
Binary file - no diff available.
Added: lucene/dev/trunk/solr/contrib/extraction/lib/tika-parsers-0.8.jar
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/extraction/lib/tika-parsers-0.8.jar?rev=1040815&view=auto
==============================================================================
Binary file - no diff available.
Modified: lucene/dev/trunk/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java?rev=1040815&r1=1040814&r2=1040815&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java (original)
+++ lucene/dev/trunk/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java Tue Nov 30 22:33:30 2010
@@ -31,6 +31,7 @@ import org.apache.solr.handler.ContentSt
import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.sax.xpath.Matcher;
@@ -190,7 +191,8 @@ public class ExtractingDocumentLoader ex
} //else leave it as is
//potentially use a wrapper handler for parsing, but we still need the SolrContentHandler for getting the document.
- parser.parse(inputStream, parsingHandler, metadata);
+ ParseContext context = new ParseContext();//TODO: should we design a way to pass in parse context?
+ parser.parse(inputStream, parsingHandler, metadata, context);
if (extractOnly == false) {
addDoc(handler);
} else {
Modified: lucene/dev/trunk/solr/contrib/extraction/src/test/java/org/apache/solr/handler/ExtractingRequestHandlerTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/extraction/src/test/java/org/apache/solr/handler/ExtractingRequestHandlerTest.java?rev=1040815&r1=1040814&r2=1040815&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/extraction/src/test/java/org/apache/solr/handler/ExtractingRequestHandlerTest.java (original)
+++ lucene/dev/trunk/solr/contrib/extraction/src/test/java/org/apache/solr/handler/ExtractingRequestHandlerTest.java Tue Nov 30 22:33:30 2010
@@ -58,13 +58,15 @@ public class ExtractingRequestHandlerTes
@Test
public void testExtraction() throws Exception {
- // broken for turkish: https://issues.apache.org/jira/browse/SOLR-2088
- String defLang = Locale.getDefault().getLanguage();
- assumeFalse("Known bugs under Turkish locale: https://issues.apache.org/jira/browse/SOLR-2088", defLang.equals("tr") || defLang.equals("az"));
ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
assertTrue("handler is null and it shouldn't be", handler != null);
- loadLocal("solr-word.pdf", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer",
+ loadLocal("solr-word.pdf",
+ "fmap.created", "extractedDate",
+ "fmap.producer", "extractedProducer",
"fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
+ "fmap.Creation-Date", "extractedDate",
+ "fmap.AAPL:Keywords", "ignored_a",
+ "fmap.xmpTPg:NPages", "ignored_a",
"fmap.Author", "extractedAuthor",
"fmap.content", "extractedContent",
"literal.id", "one",
@@ -146,6 +148,7 @@ public class ExtractingRequestHandlerTes
}
+
@Test
public void testDefaultField() throws Exception {
ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
@@ -349,6 +352,9 @@ public class ExtractingRequestHandlerTes
loadLocal("arabic.pdf", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer",
"fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
+ "fmap.Creation-Date", "extractedDate",
+ "fmap.AAPL:Keywords", "ignored_a",
+ "fmap.xmpTPg:NPages", "ignored_a",
"fmap.Author", "extractedAuthor",
"fmap.content", "wdf_nocase",
"literal.id", "one",