You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by us...@apache.org on 2015/02/26 15:26:39 UTC
svn commit: r1662457 - in /lucene/dev/trunk/solr: CHANGES.txt
contrib/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandler.java
Author: uschindler
Date: Thu Feb 26 14:26:38 2015
New Revision: 1662457
URL: http://svn.apache.org/r1662457
Log:
SOLR-7139: Fix SolrContentHandler for TIKA to ignore multiple startDocument events
Modified:
lucene/dev/trunk/solr/CHANGES.txt
lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandler.java
Modified: lucene/dev/trunk/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/CHANGES.txt?rev=1662457&r1=1662456&r2=1662457&view=diff
==============================================================================
--- lucene/dev/trunk/solr/CHANGES.txt (original)
+++ lucene/dev/trunk/solr/CHANGES.txt Thu Feb 26 14:26:38 2015
@@ -156,6 +156,9 @@ Bug Fixes
* SOLR-7128: Two phase distributed search is fetching extra fields in GET_TOP_IDS phase.
(Pablo Queixalos, shalin)
+* SOLR-7139: Fix SolrContentHandler for TIKA to ignore multiple startDocument events.
+ (Chris Mattman, Uwe Schindler)
+
Optimizations
----------------------
Modified: lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandler.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandler.java?rev=1662457&r1=1662456&r2=1662457&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandler.java (original)
+++ lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandler.java Thu Feb 26 14:26:38 2015
@@ -17,6 +17,18 @@
package org.apache.solr.handler.extraction;
+import java.text.DateFormat;
+import java.util.ArrayDeque;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Date;
+import java.util.Deque;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.Set;
+
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.DateUtil;
@@ -31,14 +43,13 @@ import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
-import java.text.DateFormat;
-import java.util.*;
-
/**
* The class responsible for handling Tika events and translating them into {@link org.apache.solr.common.SolrInputDocument}s.
* <B>This class is not thread-safe.</B>
* <p>
+ * This class cannot be reused; you have to create a new instance per document!
+ * <p>
* Users may wish to override this class to provide their own functionality.
*
* @see org.apache.solr.handler.extraction.SolrContentHandlerFactory
@@ -46,27 +57,30 @@ import java.util.*;
* @see org.apache.solr.handler.extraction.ExtractingDocumentLoader
*/
public class SolrContentHandler extends DefaultHandler implements ExtractingParams {
- private transient static Logger log = LoggerFactory.getLogger(SolrContentHandler.class);
- protected SolrInputDocument document;
+ private transient static final Logger log = LoggerFactory.getLogger(SolrContentHandler.class);
+
+ public static final String contentFieldName = "content";
- protected Collection<String> dateFormats = DateUtil.DEFAULT_DATE_FORMATS;
+ protected final SolrInputDocument document;
- protected Metadata metadata;
- protected SolrParams params;
- protected StringBuilder catchAllBuilder = new StringBuilder(2048);
- protected IndexSchema schema;
- protected Map<String, StringBuilder> fieldBuilders = Collections.emptyMap();
- private LinkedList<StringBuilder> bldrStack = new LinkedList<>();
-
- protected boolean captureAttribs;
- protected boolean lowerNames;
- protected String contentFieldName = "content";
+ protected final Collection<String> dateFormats;
- protected String unknownFieldPrefix = "";
- protected String defaultField = "";
+ protected final Metadata metadata;
+ protected final SolrParams params;
+ protected final StringBuilder catchAllBuilder = new StringBuilder(2048);
+ protected final IndexSchema schema;
+ protected final Map<String, StringBuilder> fieldBuilders;
+ private final Deque<StringBuilder> bldrStack = new ArrayDeque<>();
+
+ protected final boolean captureAttribs;
+ protected final boolean lowerNames;
+
+ protected final String unknownFieldPrefix;
+ protected final String defaultField;
- private boolean literalsOverride;
- private Set<String> literalFieldNames;
+ private final boolean literalsOverride;
+
+ private Set<String> literalFieldNames = null;
public SolrContentHandler(Metadata metadata, SolrParams params, IndexSchema schema) {
this(metadata, params, schema, DateUtil.DEFAULT_DATE_FORMATS);
@@ -75,7 +89,7 @@ public class SolrContentHandler extends
public SolrContentHandler(Metadata metadata, SolrParams params,
IndexSchema schema, Collection<String> dateFormats) {
- document = new SolrInputDocument();
+ this.document = new SolrInputDocument();
this.metadata = metadata;
this.params = params;
this.schema = schema;
@@ -86,12 +100,15 @@ public class SolrContentHandler extends
this.literalsOverride = params.getBool(LITERALS_OVERRIDE, true);
this.unknownFieldPrefix = params.get(UNKNOWN_FIELD_PREFIX, "");
this.defaultField = params.get(DEFAULT_FIELD, "");
+
String[] captureFields = params.getParams(CAPTURE_ELEMENTS);
if (captureFields != null && captureFields.length > 0) {
fieldBuilders = new HashMap<>();
for (int i = 0; i < captureFields.length; i++) {
fieldBuilders.put(captureFields[i], new StringBuilder());
}
+ } else {
+ fieldBuilders = Collections.emptyMap();
}
bldrStack.add(catchAllBuilder);
}
@@ -253,19 +270,6 @@ public class SolrContentHandler extends
// if (vals==null && fval==null) throw new RuntimeException(name + " has no non-null value ");
}
-
- @Override
- public void startDocument() throws SAXException {
- document.clear();
- catchAllBuilder.setLength(0);
- for (StringBuilder builder : fieldBuilders.values()) {
- builder.setLength(0);
- }
- bldrStack.clear();
- bldrStack.add(catchAllBuilder);
- }
-
-
@Override
public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
StringBuilder theBldr = fieldBuilders.get(localName);