You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by us...@apache.org on 2015/02/26 15:30:30 UTC

svn commit: r1662461 - in /lucene/dev/branches/branch_5x: ./ solr/ solr/CHANGES.txt solr/contrib/ solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandler.java

Author: uschindler
Date: Thu Feb 26 14:30:29 2015
New Revision: 1662461

URL: http://svn.apache.org/r1662461
Log:
Merged revision(s) 1662457 from lucene/dev/trunk:
SOLR-7139: Fix SolrContentHandler for TIKA to ignore multiple startDocument events

Modified:
    lucene/dev/branches/branch_5x/   (props changed)
    lucene/dev/branches/branch_5x/solr/   (props changed)
    lucene/dev/branches/branch_5x/solr/CHANGES.txt   (contents, props changed)
    lucene/dev/branches/branch_5x/solr/contrib/   (props changed)
    lucene/dev/branches/branch_5x/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandler.java

Modified: lucene/dev/branches/branch_5x/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/solr/CHANGES.txt?rev=1662461&r1=1662460&r2=1662461&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/solr/CHANGES.txt (original)
+++ lucene/dev/branches/branch_5x/solr/CHANGES.txt Thu Feb 26 14:30:29 2015
@@ -108,6 +108,9 @@ Bug Fixes
 * SOLR-7128: Two phase distributed search is fetching extra fields in GET_TOP_IDS phase.
   (Pablo Queixalos, shalin)
 
+* SOLR-7139: Fix SolrContentHandler for TIKA to ignore multiple startDocument events.
+  (Chris A. Mattmann, Uwe Schindler)  
+
 Optimizations
 ----------------------
 

Modified: lucene/dev/branches/branch_5x/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandler.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandler.java?rev=1662461&r1=1662460&r2=1662461&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandler.java (original)
+++ lucene/dev/branches/branch_5x/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandler.java Thu Feb 26 14:30:29 2015
@@ -17,6 +17,18 @@
 
 package org.apache.solr.handler.extraction;
 
+import java.text.DateFormat;
+import java.util.ArrayDeque;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Date;
+import java.util.Deque;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.Set;
+
 import org.apache.solr.common.SolrInputDocument;
 import org.apache.solr.common.params.SolrParams;
 import org.apache.solr.common.util.DateUtil;
@@ -31,14 +43,13 @@ import org.xml.sax.Attributes;
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.DefaultHandler;
 
-import java.text.DateFormat;
-import java.util.*;
-
 
 /**
  * The class responsible for handling Tika events and translating them into {@link org.apache.solr.common.SolrInputDocument}s.
  * <B>This class is not thread-safe.</B>
  * <p>
+ * This class cannot be reused; you have to create a new instance per document!
+ * <p>
  * User's may wish to override this class to provide their own functionality.
  *
  * @see org.apache.solr.handler.extraction.SolrContentHandlerFactory
@@ -46,27 +57,30 @@ import java.util.*;
  * @see org.apache.solr.handler.extraction.ExtractingDocumentLoader
  */
 public class SolrContentHandler extends DefaultHandler implements ExtractingParams {
-  private transient static Logger log = LoggerFactory.getLogger(SolrContentHandler.class);
-  protected SolrInputDocument document;
+  private transient static final Logger log = LoggerFactory.getLogger(SolrContentHandler.class);
+
+  public static final String contentFieldName = "content";
 
-  protected Collection<String> dateFormats = DateUtil.DEFAULT_DATE_FORMATS;
+  protected final SolrInputDocument document;
 
-  protected Metadata metadata;
-  protected SolrParams params;
-  protected StringBuilder catchAllBuilder = new StringBuilder(2048);
-  protected IndexSchema schema;
-  protected Map<String, StringBuilder> fieldBuilders = Collections.emptyMap();
-  private LinkedList<StringBuilder> bldrStack = new LinkedList<>();
-
-  protected boolean captureAttribs;
-  protected boolean lowerNames;
-  protected String contentFieldName = "content";
+  protected final Collection<String> dateFormats;
 
-  protected String unknownFieldPrefix = "";
-  protected String defaultField = "";
+  protected final Metadata metadata;
+  protected final SolrParams params;
+  protected final StringBuilder catchAllBuilder = new StringBuilder(2048);
+  protected final IndexSchema schema;
+  protected final Map<String, StringBuilder> fieldBuilders;
+  private final Deque<StringBuilder> bldrStack = new ArrayDeque<>();
+
+  protected final boolean captureAttribs;
+  protected final boolean lowerNames;
+  
+  protected final String unknownFieldPrefix;
+  protected final String defaultField;
 
-  private boolean literalsOverride;
-  private Set<String> literalFieldNames;
+  private final boolean literalsOverride;
+  
+  private Set<String> literalFieldNames = null;
   
   public SolrContentHandler(Metadata metadata, SolrParams params, IndexSchema schema) {
     this(metadata, params, schema, DateUtil.DEFAULT_DATE_FORMATS);
@@ -75,7 +89,7 @@ public class SolrContentHandler extends
 
   public SolrContentHandler(Metadata metadata, SolrParams params,
                             IndexSchema schema, Collection<String> dateFormats) {
-    document = new SolrInputDocument();
+    this.document = new SolrInputDocument();
     this.metadata = metadata;
     this.params = params;
     this.schema = schema;
@@ -86,12 +100,15 @@ public class SolrContentHandler extends
     this.literalsOverride = params.getBool(LITERALS_OVERRIDE, true);
     this.unknownFieldPrefix = params.get(UNKNOWN_FIELD_PREFIX, "");
     this.defaultField = params.get(DEFAULT_FIELD, "");
+    
     String[] captureFields = params.getParams(CAPTURE_ELEMENTS);
     if (captureFields != null && captureFields.length > 0) {
       fieldBuilders = new HashMap<>();
       for (int i = 0; i < captureFields.length; i++) {
         fieldBuilders.put(captureFields[i], new StringBuilder());
       }
+    } else {
+      fieldBuilders = Collections.emptyMap();
     }
     bldrStack.add(catchAllBuilder);
   }
@@ -253,19 +270,6 @@ public class SolrContentHandler extends
     // if (vals==null && fval==null) throw new RuntimeException(name + " has no non-null value ");
   }
 
-
-  @Override
-  public void startDocument() throws SAXException {
-    document.clear();
-    catchAllBuilder.setLength(0);
-    for (StringBuilder builder : fieldBuilders.values()) {
-      builder.setLength(0);
-    }
-    bldrStack.clear();
-    bldrStack.add(catchAllBuilder);
-  }
-
-
   @Override
   public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
     StringBuilder theBldr = fieldBuilders.get(localName);