You are viewing a plain text version of this content. The canonical link for it is here.
Posted to solr-commits@lucene.apache.org by gs...@apache.org on 2009/08/08 03:39:16 UTC
svn commit: r802282 - in /lucene/solr/trunk/contrib/extraction: ./
src/main/java/org/apache/solr/handler/extraction/
src/test/java/org/apache/solr/handler/
Author: gsingers
Date: Sat Aug 8 01:39:16 2009
New Revision: 802282
URL: http://svn.apache.org/viewvc?rev=802282&view=rev
Log:
SOLR-1274: added extract only output options
Modified:
lucene/solr/trunk/contrib/extraction/CHANGES.txt
lucene/solr/trunk/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java
lucene/solr/trunk/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingParams.java
lucene/solr/trunk/contrib/extraction/src/test/java/org/apache/solr/handler/ExtractingRequestHandlerTest.java
Modified: lucene/solr/trunk/contrib/extraction/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/contrib/extraction/CHANGES.txt?rev=802282&r1=802281&r2=802282&view=diff
==============================================================================
--- lucene/solr/trunk/contrib/extraction/CHANGES.txt (original)
+++ lucene/solr/trunk/contrib/extraction/CHANGES.txt Sat Aug 8 01:39:16 2009
@@ -33,4 +33,6 @@
5. SOLR-1310: Upgrade to Tika 0.4. Note there are some differences in detecting Languages now.
See http://www.lucidimagination.com/search/document/d6f1899a85b2a45c/vote_apache_tika_0_4_release_candidate_2#d6f1899a85b2a45c
for discussion on language detection.
- See http://www.apache.org/dist/lucene/tika/CHANGES-0.4.txt. (gsingers)
\ No newline at end of file
+ See http://www.apache.org/dist/lucene/tika/CHANGES-0.4.txt. (gsingers)
+
+6. SOLR-1274: Added text serialization output for extractOnly (Peter Wolanin, gsingers)
\ No newline at end of file
Modified: lucene/solr/trunk/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java?rev=802282&r1=802281&r2=802282&view=diff
==============================================================================
--- lucene/solr/trunk/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java (original)
+++ lucene/solr/trunk/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java Sat Aug 8 01:39:16 2009
@@ -38,7 +38,9 @@
import org.apache.tika.sax.xpath.XPathParser;
import org.apache.tika.exception.TikaException;
import org.apache.xml.serialize.OutputFormat;
+import org.apache.xml.serialize.BaseMarkupSerializer;
import org.apache.xml.serialize.XMLSerializer;
+import org.apache.xml.serialize.TextSerializer;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -52,7 +54,14 @@
*
**/
public class ExtractingDocumentLoader extends ContentStreamLoader {
-
+ /**
+ * Value of the "extractFormat" parameter that selects plain-text output when extractOnly is true.
+ */
+ public static final String TEXT_FORMAT = "text";
+ /**
+ * Value of the "extractFormat" parameter that selects XML output when extractOnly is true. This is the default.
+ */
+ public static final String XML_FORMAT = "xml";
/**
* XHTML XPath parser.
*/
@@ -152,10 +161,17 @@
ContentHandler parsingHandler = handler;
StringWriter writer = null;
- XMLSerializer serializer = null;
+ BaseMarkupSerializer serializer = null;
if (extractOnly == true) {
+ String extractFormat = params.get(ExtractingParams.EXTRACT_FORMAT, "xml");
writer = new StringWriter();
- serializer = new XMLSerializer(writer, new OutputFormat("XML", "UTF-8", true));
+ if (extractFormat.equals(TEXT_FORMAT)) {
+ serializer = new TextSerializer();
+ serializer.setOutputCharStream(writer);
+ serializer.setOutputFormat(new OutputFormat("Text", "UTF-8", true));
+ } else {
+ serializer = new XMLSerializer(writer, new OutputFormat("XML", "UTF-8", true));
+ }
if (xpathExpr != null) {
Matcher matcher =
PARSER.parse(xpathExpr);
Modified: lucene/solr/trunk/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingParams.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingParams.java?rev=802282&r1=802281&r2=802282&view=diff
==============================================================================
--- lucene/solr/trunk/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingParams.java (original)
+++ lucene/solr/trunk/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingParams.java Sat Aug 8 01:39:16 2009
@@ -82,6 +82,11 @@
public static final String EXTRACT_ONLY = "extractOnly";
/**
+ * Content output format if extractOnly is true. Default is "xml", alternative is "text".
+ */
+ public static final String EXTRACT_FORMAT = "extractFormat";
+
+ /**
* Capture attributes separately according to the name of the element, instead of just adding them to the string buffer
*/
public static final String CAPTURE_ATTRIBUTES = "captureAttr";
Modified: lucene/solr/trunk/contrib/extraction/src/test/java/org/apache/solr/handler/ExtractingRequestHandlerTest.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/contrib/extraction/src/test/java/org/apache/solr/handler/ExtractingRequestHandlerTest.java?rev=802282&r1=802281&r2=802282&view=diff
==============================================================================
--- lucene/solr/trunk/contrib/extraction/src/test/java/org/apache/solr/handler/ExtractingRequestHandlerTest.java (original)
+++ lucene/solr/trunk/contrib/extraction/src/test/java/org/apache/solr/handler/ExtractingRequestHandlerTest.java Sat Aug 8 01:39:16 2009
@@ -25,6 +25,7 @@
import org.apache.solr.common.SolrException;
import org.apache.solr.handler.extraction.ExtractingParams;
import org.apache.solr.handler.extraction.ExtractingRequestHandler;
+import org.apache.solr.handler.extraction.ExtractingDocumentLoader;
import java.util.List;
import java.util.ArrayList;
@@ -243,6 +244,24 @@
assertTrue("nl is null and it shouldn't be", nl != null);
Object title = nl.get("title");
assertTrue("title is null and it shouldn't be", title != null);
+ assertTrue(extraction.indexOf("<?xml") != -1);
+
+ rsp = loadLocal("solr-word.pdf", ExtractingParams.EXTRACT_ONLY, "true",
+ ExtractingParams.EXTRACT_FORMAT, ExtractingDocumentLoader.TEXT_FORMAT);
+ assertTrue("rsp is null and it shouldn't be", rsp != null);
+ list = rsp.getValues();
+
+ extraction = (String) list.get("solr-word.pdf");
+ assertTrue("extraction is null and it shouldn't be", extraction != null);
+ assertTrue(extraction + " does not contain " + "solr-word", extraction.indexOf("solr-word") != -1);
+ assertTrue(extraction.indexOf("<?xml") == -1);
+
+ nl = (NamedList) list.get("solr-word.pdf_metadata");
+ assertTrue("nl is null and it shouldn't be", nl != null);
+ title = nl.get("title");
+ assertTrue("title is null and it shouldn't be", title != null);
+
+
}