You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by si...@apache.org on 2012/09/21 19:22:27 UTC

svn commit: r1388574 [39/45] - in /lucene/dev/branches/LUCENE-2878: ./ dev-tools/ dev-tools/eclipse/ dev-tools/eclipse/dot.settings/ dev-tools/idea/ dev-tools/idea/.idea/ dev-tools/idea/.idea/libraries/ dev-tools/idea/lucene/ dev-tools/idea/lucene/anal...

Modified: lucene/dev/branches/LUCENE-2878/solr/core/src/java/org/apache/solr/update/processor/DistributedUpdateProcessor.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/solr/core/src/java/org/apache/solr/update/processor/DistributedUpdateProcessor.java?rev=1388574&r1=1388573&r2=1388574&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/solr/core/src/java/org/apache/solr/update/processor/DistributedUpdateProcessor.java (original)
+++ lucene/dev/branches/LUCENE-2878/solr/core/src/java/org/apache/solr/update/processor/DistributedUpdateProcessor.java Fri Sep 21 17:21:34 2012
@@ -37,6 +37,7 @@ import org.apache.solr.common.SolrExcept
 import org.apache.solr.common.SolrInputDocument;
 import org.apache.solr.common.SolrInputField;
 import org.apache.solr.common.cloud.ClusterState;
+import org.apache.solr.common.cloud.Replica;
 import org.apache.solr.common.cloud.Slice;
 import org.apache.solr.common.cloud.ZkCoreNodeProps;
 import org.apache.solr.common.cloud.ZkNodeProps;
@@ -99,7 +100,8 @@ public class DistributedUpdateProcessor 
   }
 
   public static final String COMMIT_END_POINT = "commit_end_point";
-
+  public static final String LOG_REPLAY = "log_replay";
+  
   private final SolrQueryRequest req;
   private final SolrQueryResponse rsp;
   private final UpdateRequestProcessor next;
@@ -119,7 +121,7 @@ public class DistributedUpdateProcessor 
   
   private final SchemaField idField;
   
-  private final SolrCmdDistributor cmdDistrib;
+  private SolrCmdDistributor cmdDistrib;
 
   private boolean zkEnabled = false;
 
@@ -161,18 +163,15 @@ public class DistributedUpdateProcessor 
     zkController = req.getCore().getCoreDescriptor().getCoreContainer().getZkController();
     if (zkEnabled) {
       numNodes =  zkController.getZkStateReader().getClusterState().getLiveNodes().size();
+      cmdDistrib = new SolrCmdDistributor(numNodes, coreDesc.getCoreContainer().getZkController().getCmdDistribExecutor());
     }
     //this.rsp = reqInfo != null ? reqInfo.getRsp() : null;
 
-   
-    
     cloudDesc = coreDesc.getCloudDescriptor();
     
     if (cloudDesc != null) {
       collection = cloudDesc.getCollectionName();
     }
-
-    cmdDistrib = new SolrCmdDistributor(numNodes);
   }
 
   private List<Node> setupRequest(int hash) {
@@ -183,15 +182,9 @@ public class DistributedUpdateProcessor 
       // set num nodes
       numNodes = zkController.getClusterState().getLiveNodes().size();
       
-      // the leader is...
-      // TODO: if there is no leader, wait and look again
-      // TODO: we are reading the leader from zk every time - we should cache
-      // this and watch for changes?? Just pull it from ZkController cluster state probably?
       String shardId = getShard(hash, collection, zkController.getClusterState()); // get the right shard based on the hash...
 
       try {
-        // TODO: if we find out we cannot talk to zk anymore, we should probably realize we are not
-        // a leader anymore - we shouldn't accept updates at all??
         ZkCoreNodeProps leaderProps = new ZkCoreNodeProps(zkController.getZkStateReader().getLeaderProps(
             collection, shardId));
         
@@ -201,7 +194,10 @@ public class DistributedUpdateProcessor 
         isLeader = coreNodeName.equals(leaderNodeName);
         
         DistribPhase phase = 
-          DistribPhase.parseParam(req.getParams().get(DISTRIB_UPDATE_PARAM));
+            DistribPhase.parseParam(req.getParams().get(DISTRIB_UPDATE_PARAM));
+       
+        doDefensiveChecks(shardId, phase);
+     
 
         if (DistribPhase.FROMLEADER == phase) {
           // we are coming from the leader, just go local - add no urls
@@ -251,6 +247,21 @@ public class DistributedUpdateProcessor 
     return nodes;
   }
 
+  private void doDefensiveChecks(String shardId, DistribPhase phase) {
+    String from = req.getParams().get("distrib.from");
+    boolean logReplay = req.getParams().getBool(LOG_REPLAY, false);
+    boolean localIsLeader = req.getCore().getCoreDescriptor().getCloudDescriptor().isLeader();
+    if (!logReplay && DistribPhase.FROMLEADER == phase && localIsLeader && from != null) { // from will be null on log replay
+      log.error("Request says it is coming from leader, but we are the leader: " + req.getParamString());
+      throw new SolrException(ErrorCode.SERVICE_UNAVAILABLE, "Request says it is coming from leader, but we are the leader");
+    }
+    
+    if (isLeader && !localIsLeader) {
+      log.error("ClusterState says we are the leader, but locally we don't think so");
+      throw new SolrException(ErrorCode.SERVICE_UNAVAILABLE, "ClusterState says we are the leader, but locally we don't think so");
+    }
+  }
+
 
   private String getShard(int hash, String collection, ClusterState clusterState) {
     // ranges should be part of the cloud state and eventually gotten from zk
@@ -310,10 +321,7 @@ public class DistributedUpdateProcessor 
     
     boolean dropCmd = false;
     if (!forwardToLeader) {
-      // clone the original doc
-      SolrInputDocument clonedDoc = cmd.solrDoc.deepCopy();
-      dropCmd = versionAdd(cmd, clonedDoc);
-      cmd.solrDoc = clonedDoc;
+      dropCmd = versionAdd(cmd);
     }
 
     if (dropCmd) {
@@ -323,12 +331,19 @@ public class DistributedUpdateProcessor 
     
     ModifiableSolrParams params = null;
     if (nodes != null) {
+      
       params = new ModifiableSolrParams(req.getParams());
       params.set(DISTRIB_UPDATE_PARAM, 
                  (isLeader ? 
                   DistribPhase.FROMLEADER.toString() : 
                   DistribPhase.TOLEADER.toString()));
+      if (isLeader) {
+        params.set("distrib.from", ZkCoreNodeProps.getCoreUrl(
+            zkController.getBaseUrl(), req.getCore().getName()));
+      }
       params.remove("commit"); // this will be distributed from the local commit
+      params.set("distrib.from", ZkCoreNodeProps.getCoreUrl(
+          zkController.getBaseUrl(), req.getCore().getName()));
       cmdDistrib.distribAdd(cmd, nodes, params);
     }
     
@@ -378,9 +393,11 @@ public class DistributedUpdateProcessor 
 
     // TODO: we should do this in the background it would seem
     for (SolrCmdDistributor.Error error : response.errors) {
-      if (error.node instanceof RetryNode) {
+      if (error.node instanceof RetryNode || error.e instanceof SolrException) {
         // we don't try to force a leader to recover
         // when we cannot forward to it
+        // and we assume SolrException means
+        // the node went down
         continue;
       }
       // TODO: we should force their state to recovering ??
@@ -421,11 +438,10 @@ public class DistributedUpdateProcessor 
 
   /**
    * @param cmd
-   * @param cloneDoc needs the version if it's assigned
    * @return whether or not to drop this cmd
    * @throws IOException
    */
-  private boolean versionAdd(AddUpdateCommand cmd, SolrInputDocument cloneDoc) throws IOException {
+  private boolean versionAdd(AddUpdateCommand cmd) throws IOException {
     BytesRef idBytes = cmd.getIndexedId();
 
     if (vinfo == null || idBytes == null) {
@@ -498,7 +514,6 @@ public class DistributedUpdateProcessor 
             long version = vinfo.getNewClock();
             cmd.setVersion(version);
             cmd.getSolrInputDocument().setField(VersionInfo.VERSION_FIELD, version);
-            cloneDoc.setField(VersionInfo.VERSION_FIELD, version);
             bucket.updateHighest(version);
           } else {
             // The leader forwarded us this update.
@@ -530,9 +545,20 @@ public class DistributedUpdateProcessor 
             }
           }
         }
-
+        
+        boolean willDistrib = isLeader && nodes != null && nodes.size() > 0;
+        
+        SolrInputDocument clonedDoc = null;
+        if (willDistrib) {
+          clonedDoc = cmd.solrDoc.deepCopy();
+        }
+        
         // TODO: possibly set checkDeleteByQueries as a flag on the command?
         doLocalAdd(cmd);
+        
+        if (willDistrib) {
+          cmd.solrDoc = clonedDoc;
+        }
 
       }  // end synchronized (bucket)
     } finally {
@@ -653,11 +679,16 @@ public class DistributedUpdateProcessor 
 
     ModifiableSolrParams params = null;
     if (nodes != null) {
+      
       params = new ModifiableSolrParams(req.getParams());
       params.set(DISTRIB_UPDATE_PARAM, 
                  (isLeader ? 
                   DistribPhase.FROMLEADER.toString() : 
                   DistribPhase.TOLEADER.toString()));
+      if (isLeader) {
+        params.set("distrib.from", ZkCoreNodeProps.getCoreUrl(
+            zkController.getBaseUrl(), req.getCore().getName()));
+      }
       params.remove("commit"); // we already will have forwarded this from our local commit
       cmdDistrib.distribDelete(cmd, nodes, params);
     }
@@ -715,7 +746,7 @@ public class DistributedUpdateProcessor 
         try {
           leaderProps = zkController.getZkStateReader().getLeaderProps(collection, sliceName);
         } catch (InterruptedException e) {
-          throw new SolrException(ErrorCode.SERVER_ERROR, "Exception finding leader for shard " + sliceName, e);
+          throw new SolrException(ErrorCode.SERVICE_UNAVAILABLE, "Exception finding leader for shard " + sliceName, e);
         }
 
         // TODO: What if leaders changed in the meantime?
@@ -819,6 +850,8 @@ public class DistributedUpdateProcessor 
       ModifiableSolrParams params = new ModifiableSolrParams(req.getParams());
       params.set(VERSION_FIELD, Long.toString(cmd.getVersion()));
       params.set(DISTRIB_UPDATE_PARAM, DistribPhase.FROMLEADER.toString());
+      params.set("update.from", ZkCoreNodeProps.getCoreUrl(
+          zkController.getBaseUrl(), req.getCore().getName()));
       cmdDistrib.distribDelete(cmd, replicas, params);
       cmdDistrib.finish();
     }
@@ -835,11 +868,14 @@ public class DistributedUpdateProcessor 
 
 
   private void zkCheck() {
-    int retries = 10;
-    while (!zkController.isConnected()) {
-      
-      if (retries-- == 0) {
-        throw new SolrException(ErrorCode.SERVICE_UNAVAILABLE, "Cannot talk to ZooKeeper - Updates are disabled.");
+    if (zkController.isConnected()) {
+      return;
+    }
+    
+    long timeoutAt = System.currentTimeMillis() + zkController.getClientTimeout();
+    while (System.currentTimeMillis() < timeoutAt) {
+      if (zkController.isConnected()) {
+        return;
       }
       try {
         Thread.sleep(100);
@@ -848,7 +884,7 @@ public class DistributedUpdateProcessor 
         break;
       }
     }
-    
+    throw new SolrException(ErrorCode.SERVICE_UNAVAILABLE, "Cannot talk to ZooKeeper - Updates are disabled.");
   }
 
   private boolean versionDelete(DeleteUpdateCommand cmd) throws IOException {
@@ -989,7 +1025,7 @@ public class DistributedUpdateProcessor 
   
   @Override
   public void finish() throws IOException {
-    doFinish();
+    if (zkEnabled) doFinish();
     
     if (next != null && nodes == null) next.finish();
   }
@@ -1008,9 +1044,9 @@ public class DistributedUpdateProcessor 
     for (Map.Entry<String,Slice> sliceEntry : slices.entrySet()) {
       Slice replicas = slices.get(sliceEntry.getKey());
 
-      Map<String,ZkNodeProps> shardMap = replicas.getShards();
+      Map<String,Replica> shardMap = replicas.getReplicasMap();
       
-      for (Entry<String,ZkNodeProps> entry : shardMap.entrySet()) {
+      for (Entry<String,Replica> entry : shardMap.entrySet()) {
         ZkCoreNodeProps nodeProps = new ZkCoreNodeProps(entry.getValue());
         if (clusterState.liveNodesContain(nodeProps.getNodeName()) && !entry.getKey().equals(shardZkNodeName)) {
           urls.add(new StdNode(nodeProps));
@@ -1027,13 +1063,13 @@ public class DistributedUpdateProcessor 
   // make the hash pluggable of course.
   // The hash also needs to be pluggable
   private int hash(AddUpdateCommand cmd) {
-    BytesRef br = cmd.getIndexedId();
-    return Hash.murmurhash3_x86_32(br.bytes, br.offset, br.length, 0);
+    String hashableId = cmd.getHashableId();
+    
+    return Hash.murmurhash3_x86_32(hashableId, 0, hashableId.length(), 0);
   }
   
   private int hash(DeleteUpdateCommand cmd) {
-    BytesRef br = cmd.getIndexedId();
-    return Hash.murmurhash3_x86_32(br.bytes, br.offset, br.length, 0);
+    return Hash.murmurhash3_x86_32(cmd.getId(), 0, cmd.getId().length(), 0);
   }
   
   // RetryNodes are used in the case of 'forward to leader' where we want

Modified: lucene/dev/branches/LUCENE-2878/solr/core/src/java/org/apache/solr/update/processor/FieldMutatingUpdateProcessorFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/solr/core/src/java/org/apache/solr/update/processor/FieldMutatingUpdateProcessorFactory.java?rev=1388574&r1=1388573&r2=1388574&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/solr/core/src/java/org/apache/solr/update/processor/FieldMutatingUpdateProcessorFactory.java (original)
+++ lucene/dev/branches/LUCENE-2878/solr/core/src/java/org/apache/solr/update/processor/FieldMutatingUpdateProcessorFactory.java Fri Sep 21 17:21:34 2012
@@ -68,7 +68,7 @@ import org.apache.solr.util.plugin.SolrC
  * containing any of the above criteria, identifying fields to be excluded 
  * from seelction even if they match the selection criteria.  As with the main 
  * selection critiera a field must match all of criteria in a single exclusion 
- * in order to be excluded, but multiple exclusions may be specified to get an 
+ * in order to be excluded, but multiple exclusions may be specified to get an
  * <code>OR</code> behavior
  * </p>
  *

Modified: lucene/dev/branches/LUCENE-2878/solr/core/src/java/org/apache/solr/util/DOMUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/solr/core/src/java/org/apache/solr/util/DOMUtil.java?rev=1388574&r1=1388573&r2=1388574&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/solr/core/src/java/org/apache/solr/util/DOMUtil.java (original)
+++ lucene/dev/branches/LUCENE-2878/solr/core/src/java/org/apache/solr/util/DOMUtil.java Fri Sep 21 17:21:34 2012
@@ -31,6 +31,8 @@ import org.w3c.dom.NodeList;
  */
 public class DOMUtil {
 
+  public static final String XML_RESERVED_PREFIX = "xml";
+
   public static Map<String,String> toMap(NamedNodeMap attrs) {
     return toMapExcept(attrs);
   }
@@ -39,6 +41,10 @@ public class DOMUtil {
     Map<String,String> args = new HashMap<String,String>();
     outer: for (int j=0; j<attrs.getLength(); j++) {
       Node attr = attrs.item(j);
+
+      // automatically exclude things in the xml namespace, e.g. xml:base
+      if (XML_RESERVED_PREFIX.equals(attr.getPrefix())) continue outer;
+
       String attrName = attr.getNodeName();
       for (String ex : exclusions)
         if (ex.equals(attrName)) continue outer;

Modified: lucene/dev/branches/LUCENE-2878/solr/core/src/java/org/apache/solr/util/FastWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/solr/core/src/java/org/apache/solr/util/FastWriter.java?rev=1388574&r1=1388573&r2=1388574&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/solr/core/src/java/org/apache/solr/util/FastWriter.java (original)
+++ lucene/dev/branches/LUCENE-2878/solr/core/src/java/org/apache/solr/util/FastWriter.java Fri Sep 21 17:21:34 2012
@@ -28,7 +28,7 @@ public class FastWriter extends Writer {
   // it won't cause double buffering.
   private static final int BUFSIZE = 8192;
   protected final Writer sink;
-  protected final char[] buf;
+  protected char[] buf;
   protected int pos;
 
   public FastWriter(Writer w) {
@@ -69,42 +69,64 @@ public class FastWriter extends Writer {
   }
 
   @Override
-  public void write(char cbuf[], int off, int len) throws IOException {
-    int space = buf.length - pos;
-    if (len < space) {
-      System.arraycopy(cbuf, off, buf, pos, len);
-      pos += len;
-    } else if (len<BUFSIZE) {
-      // if the data to write is small enough, buffer it.
-      System.arraycopy(cbuf, off, buf, pos, space);
+  public void write(char arr[], int off, int len) throws IOException {
+    for(;;) {
+      int space = buf.length - pos;
+
+      if (len <= space) {
+        System.arraycopy(arr, off, buf, pos, len);
+        pos += len;
+        return;
+      } else if (len > buf.length) {
+        if (pos>0) {
+          flush(buf,0,pos);  // flush
+          pos=0;
+        }
+        // don't buffer, just write to sink
+        flush(arr, off, len);
+        return;
+      }
+
+      // the data is too big to fit in the free space, but
+      // not big enough to warrant writing on its own.
+      // copy in whatever fits, then flush and iterate.
+
+      System.arraycopy(arr, off, buf, pos, space);
       flush(buf, 0, buf.length);
-      pos = len-space;
-      System.arraycopy(cbuf, off+space, buf, 0, pos);
-    } else {
-      flush(buf,0,pos);  // flush
-      pos=0;
-      // don't buffer, just write to sink
-      flush(cbuf, off, len);
+      pos = 0;
+      off += space;
+      len -= space;
     }
   }
 
   @Override
   public void write(String str, int off, int len) throws IOException {
-    int space = buf.length - pos;
-    if (len < space) {
-      str.getChars(off, off+len, buf, pos);
-      pos += len;
-    } else if (len<BUFSIZE) {
-      // if the data to write is small enough, buffer it.
+    for(;;) {
+      int space = buf.length - pos;
+
+      if (len <= space) {
+        str.getChars(off, off+len, buf, pos);
+        pos += len;
+        return;
+      } else if (len > buf.length) {
+        if (pos>0) {
+          flush(buf,0,pos);  // flush
+          pos=0;
+        }
+        // don't buffer, just write to sink
+        flush(str, off, len);
+        return;
+      }
+
+      // the data is too big to fit in the free space, but
+      // not big enough to warrant writing on its own.
+      // copy in whatever fits, then flush and iterate.
+
       str.getChars(off, off+space, buf, pos);
       flush(buf, 0, buf.length);
-      str.getChars(off+space, off+len, buf, 0);
-      pos = len-space;
-    } else {
-      flush(buf,0,pos);  // flush
-      pos=0;
-      // don't buffer, just write to sink
-      flush(str, off, len);
+      pos = 0;
+      off += space;
+      len -= space;
     }
   }
 

Modified: lucene/dev/branches/LUCENE-2878/solr/core/src/java/org/apache/solr/util/LongPriorityQueue.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/solr/core/src/java/org/apache/solr/util/LongPriorityQueue.java?rev=1388574&r1=1388573&r2=1388574&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/solr/core/src/java/org/apache/solr/util/LongPriorityQueue.java (original)
+++ lucene/dev/branches/LUCENE-2878/solr/core/src/java/org/apache/solr/util/LongPriorityQueue.java Fri Sep 21 17:21:34 2012
@@ -152,10 +152,10 @@ public class LongPriorityQueue {
     time.  Only valid if size() > 0.
    */
   public long pop() {
-    long result = heap[1];	          // save first value
-    heap[1] = heap[size];	          // move last to first
+    long result = heap[1];            // save first value
+    heap[1] = heap[size];            // move last to first
     size--;
-    downHeap();				  // adjust heap
+    downHeap();          // adjust heap
     return result;
   }
   
@@ -187,11 +187,11 @@ public class LongPriorityQueue {
    */
   public long[] sort(int n) {
     while (--n >= 0) {
-      long result = heap[1];	          // save first value
-      heap[1] = heap[size];	          // move last to first
+      long result = heap[1];            // save first value
+      heap[1] = heap[size];            // move last to first
       heap[size] = result;                  // place it last
       size--;
-      downHeap();				  // adjust heap
+      downHeap();          // adjust heap
     }
     return heap;
   }
@@ -203,26 +203,26 @@ public class LongPriorityQueue {
 
   private void upHeap() {
     int i = size;
-    long node = heap[i];			  // save bottom node
+    long node = heap[i];        // save bottom node
     int j = i >>> 1;
     while (j > 0 && node < heap[j]) {
-      heap[i] = heap[j];			  // shift parents down
+      heap[i] = heap[j];        // shift parents down
       i = j;
       j = j >>> 1;
     }
-    heap[i] = node;				  // install saved node
+    heap[i] = node;          // install saved node
   }
 
   private void downHeap() {
     int i = 1;
-    long node = heap[i];			  // save top node
-    int j = i << 1;				  // find smaller child
+    long node = heap[i];        // save top node
+    int j = i << 1;          // find smaller child
     int k = j + 1;
     if (k <= size && heap[k] < heap[j]) {
       j = k;
     }
     while (j <= size && heap[j] < node) {
-      heap[i] = heap[j];			  // shift up child
+      heap[i] = heap[j];        // shift up child
       i = j;
       j = i << 1;
       k = j + 1;
@@ -230,6 +230,6 @@ public class LongPriorityQueue {
         j = k;
       }
     }
-    heap[i] = node;				  // install saved node
+    heap[i] = node;          // install saved node
   }
 }

Modified: lucene/dev/branches/LUCENE-2878/solr/core/src/java/org/apache/solr/util/SimplePostTool.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/solr/core/src/java/org/apache/solr/util/SimplePostTool.java?rev=1388574&r1=1388573&r2=1388574&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/solr/core/src/java/org/apache/solr/util/SimplePostTool.java (original)
+++ lucene/dev/branches/LUCENE-2878/solr/core/src/java/org/apache/solr/util/SimplePostTool.java Fri Sep 21 17:21:34 2012
@@ -17,65 +17,110 @@ package org.apache.solr.util;
  * limitations under the License.
  */
 
+import java.io.BufferedReader;
+import java.io.ByteArrayOutputStream;
 import java.io.File;
 import java.io.FileFilter;
 import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.ByteArrayInputStream;
+import java.io.InputStreamReader;
 import java.io.OutputStream;
 import java.io.UnsupportedEncodingException;
-import java.util.Locale;
+import java.util.ArrayList;
 import java.util.HashMap;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
 import java.util.Set;
 import java.util.HashSet;
 import java.util.regex.Pattern;
 import java.util.regex.PatternSyntaxException;
+import java.util.zip.GZIPInputStream;
+import java.util.zip.Inflater;
+import java.util.zip.InflaterInputStream;
 import java.net.HttpURLConnection;
 import java.net.MalformedURLException;
 import java.net.ProtocolException;
 import java.net.URL;
 import java.net.URLEncoder;
 
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.xpath.XPath;
+import javax.xml.xpath.XPathConstants;
+import javax.xml.xpath.XPathExpression;
+import javax.xml.xpath.XPathExpressionException;
+import javax.xml.xpath.XPathFactory;
+
+import org.w3c.dom.Document;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+import org.xml.sax.SAXException;
+
 /**
  * A simple utility class for posting raw updates to a Solr server, 
  * has a main method so it can be run on the command line.
+ * View this not as a best-practice code example, but as a standalone 
+ * example built with an explicit purpose of not having external
+ * jar dependencies.
  */
 public class SimplePostTool {
-  public static final String DEFAULT_POST_URL = "http://localhost:8983/solr/update";
-  public static final String VERSION_OF_THIS_TOOL = "1.5";
+  private static final String DEFAULT_POST_URL = "http://localhost:8983/solr/update";
+  private static final String VERSION_OF_THIS_TOOL = "1.5";
 
   private static final String DEFAULT_COMMIT = "yes";
   private static final String DEFAULT_OPTIMIZE = "no";
   private static final String DEFAULT_OUT = "no";
   private static final String DEFAULT_AUTO = "no";
-  private static final String DEFAULT_RECURSIVE = "no";
-
+  private static final String DEFAULT_RECURSIVE = "0";
+  private static final int DEFAULT_WEB_DELAY = 10;
+  private static final int MAX_WEB_DEPTH = 10;
   private static final String DEFAULT_CONTENT_TYPE = "application/xml";
   private static final String DEFAULT_FILE_TYPES = "xml,json,csv,pdf,doc,docx,ppt,pptx,xls,xlsx,odt,odp,ods,ott,otp,ots,rtf,htm,html,txt,log"; 
 
-  private static final String DATA_MODE_FILES = "files";
-  private static final String DATA_MODE_ARGS = "args";
-  private static final String DATA_MODE_STDIN = "stdin";
-  private static final String DEFAULT_DATA_MODE = DATA_MODE_FILES;
-
-  private static final String TRUE_STRINGS = "true,on,yes,1"; 
-
-  private boolean auto = false;
-  private boolean recursive = false;
-  private String fileTypes;
-  
-  private static HashMap<String,String> mimeMap;
-  private GlobFileFilter globFileFilter;
-  
-  private static final Set<String> DATA_MODES = new HashSet<String>();
-  private static final String USAGE_STRING_SHORT =
-      "Usage: java [SystemProperties] -jar post.jar [-h|-] [<file|folder|arg> [<file|folder|arg>...]]";
+  static final String DATA_MODE_FILES = "files";
+  static final String DATA_MODE_ARGS = "args";
+  static final String DATA_MODE_STDIN = "stdin";
+  static final String DATA_MODE_WEB = "web";
+  static final String DEFAULT_DATA_MODE = DATA_MODE_FILES;
+
+  // Input args
+  boolean auto = false;
+  int recursive = 0;
+  int delay = 0;
+  String fileTypes;
+  URL solrUrl;
+  OutputStream out = null;
+  String type;
+  String mode;
+  boolean commit;
+  boolean optimize;
+  String[] args;
+
+  private int currentDepth;
+
+  static HashMap<String,String> mimeMap;
+  GlobFileFilter globFileFilter;
+  // Backlog for crawling
+  List<LinkedHashSet<URL>> backlog = new ArrayList<LinkedHashSet<URL>>();
+  Set<URL> visited = new HashSet<URL>();
+  
+  static final Set<String> DATA_MODES = new HashSet<String>();
+  static final String USAGE_STRING_SHORT =
+      "Usage: java [SystemProperties] -jar post.jar [-h|-] [<file|folder|url|arg> [<file|folder|url|arg>...]]";
+
+  // Used in tests to avoid doing actual network traffic
+  static boolean mockMode = false;
+  static PageFetcher pageFetcher;
 
   static {
     DATA_MODES.add(DATA_MODE_FILES);
     DATA_MODES.add(DATA_MODE_ARGS);
     DATA_MODES.add(DATA_MODE_STDIN);
+    DATA_MODES.add(DATA_MODE_WEB);
     
     mimeMap = new HashMap<String,String>();
     mimeMap.put("xml", "text/xml");
@@ -100,97 +145,196 @@ public class SimplePostTool {
     mimeMap.put("txt", "text/plain");
     mimeMap.put("log", "text/plain");
   }
-
-  protected URL solrUrl;
   
+  /**
+   * See usage() for valid command line usage
+   * @param args the params on the command line
+   */
   public static void main(String[] args) {
     info("SimplePostTool version " + VERSION_OF_THIS_TOOL);
-
     if (0 < args.length && ("-help".equals(args[0]) || "--help".equals(args[0]) || "-h".equals(args[0]))) {
       usage();
+    } else {
+      final SimplePostTool t = parseArgsAndInit(args);
+      t.execute();
+    }
+  }
+
+  /**
+   * After initialization, call execute to start the post job.
+   * This method delegates to the correct mode method.
+   */
+  public void execute() {
+    if (DATA_MODE_FILES.equals(mode) && args.length > 0) {
+      doFilesMode();
+    } else if(DATA_MODE_ARGS.equals(mode) && args.length > 0) {
+      doArgsMode();
+    } else if(DATA_MODE_WEB.equals(mode) && args.length > 0) {
+      doWebMode();
+    } else if(DATA_MODE_STDIN.equals(mode)) {
+      doStdinMode();
+    } else {
+      usageShort();
       return;
     }
     
-    OutputStream out = null;
-    final String type = System.getProperty("type");
-
-    final String params = System.getProperty("params", "");
-
-    URL u = null;
+    if (commit)   commit();
+    if (optimize) optimize();
+  }
+  
+  /**
+   * Parses incoming arguments and system params and initializes the tool
+   * @param args the incoming cmd line args
+   * @return an instance of SimplePostTool
+   */
+  protected static SimplePostTool parseArgsAndInit(String[] args) {
+    String urlStr = null;
     try {
-      u = new URL(System.getProperty("url", SimplePostTool.appendParam(DEFAULT_POST_URL, params)));
+      // Parse args
+      final String mode = System.getProperty("data", DEFAULT_DATA_MODE);
+      if (! DATA_MODES.contains(mode)) {
+        fatal("System Property 'data' is not valid for this tool: " + mode);
+      }
+      String params = System.getProperty("params", "");
+      urlStr = System.getProperty("url", SimplePostTool.appendParam(DEFAULT_POST_URL, params));
+      URL url = new URL(urlStr);
+      boolean auto = isOn(System.getProperty("auto", DEFAULT_AUTO));
+      String type = System.getProperty("type");
+      // Recursive
+      int recursive = 0;
+      String r = System.getProperty("recursive", DEFAULT_RECURSIVE);
+      try {
+        recursive = Integer.parseInt(r);
+      } catch(Exception e) {
+        if (isOn(r))
+          recursive = DATA_MODE_WEB.equals(mode)?1:999;
+      }
+      // Delay
+      int delay = DATA_MODE_WEB.equals(mode) ? DEFAULT_WEB_DELAY : 0;
+      try {
+        delay = Integer.parseInt(System.getProperty("delay", ""+delay));
+      } catch(Exception e) { }
+      OutputStream out = isOn(System.getProperty("out", DEFAULT_OUT)) ? System.out : null;
+      String fileTypes = System.getProperty("filetypes", DEFAULT_FILE_TYPES);
+      boolean commit = isOn(System.getProperty("commit",DEFAULT_COMMIT));
+      boolean optimize = isOn(System.getProperty("optimize",DEFAULT_OPTIMIZE));
+      
+      return new SimplePostTool(mode, url, auto, type, recursive, delay, fileTypes, out, commit, optimize, args);
     } catch (MalformedURLException e) {
-      fatal("System Property 'url' is not a valid URL: " + u);
+      fatal("System Property 'url' is not a valid URL: " + urlStr);
+      return null;
     }
-    final SimplePostTool t = new SimplePostTool(u);
+  }
 
-    if (isOn(System.getProperty("auto", DEFAULT_AUTO))) {
-      t.setAuto(true);
-    }
-    
-    if (isOn(System.getProperty("recursive", DEFAULT_RECURSIVE))) {
-      t.setRecursive(true);
-    }
+  /**
+   * Constructor which takes in all mandatory input for the tool to work.
+   * Also see usage() for further explanation of the params.
+   * @param mode the data mode: one of the DATA_MODE_* constants (files, web, args or stdin)
+   * @param url the Solr base Url to post to, should end with /update
+   * @param auto if true, we'll guess type and add resourcename/url
+   * @param type content-type of the data you are posting, null for the default
+   * @param recursive number of levels for file/web mode, or 0 if one file only
+   * @param delay the wait time in seconds after each post
+   * @param fileTypes a comma separated list of file-name endings to accept for file/web
+   * @param out an OutputStream to write output to, e.g. stdout to print to console
+   * @param commit if true, will commit at end of posting
+   * @param optimize if true, will optimize at end of posting
+   * @param args a String[] of arguments, varies between modes
+   */
+  public SimplePostTool(String mode, URL url, boolean auto, String type,
+      int recursive, int delay, String fileTypes, OutputStream out, 
+      boolean commit, boolean optimize, String[] args) {
+    this.mode = mode;
+    this.solrUrl = url;
+    this.auto = auto;
+    this.type = type;
+    this.recursive = recursive;
+    this.delay = delay;
+    this.fileTypes = fileTypes;
+    this.globFileFilter = getFileFilterFromFileTypes(fileTypes);
+    this.out = out;
+    this.commit = commit;
+    this.optimize = optimize;
+    this.args = args;
+    pageFetcher = new PageFetcher(); // NOTE: static field, replaced by every new instance
+  }
 
-    final String mode = System.getProperty("data", DEFAULT_DATA_MODE);
-    if (! DATA_MODES.contains(mode)) {
-      fatal("System Property 'data' is not valid for this tool: " + mode);
+  public SimplePostTool() {}
+  
+  //
+  // Do some action depending on which mode we have
+  //
+  private void doFilesMode() {
+    currentDepth = 0;
+    // Skip posting files if special param "-" given  
+    if (!args[0].equals("-")) {
+      info("Posting files to base url " + solrUrl + (!auto?" using content-type "+(type==null?DEFAULT_CONTENT_TYPE:type):"")+"..");
+      if(auto)
+        info("Entering auto mode. File endings considered are "+fileTypes);
+      if(recursive > 0)
+        info("Entering recursive mode, max depth="+recursive+", delay="+delay+"s"); 
+      int numFilesPosted = postFiles(args, 0, out, type);
+      info(numFilesPosted + " files indexed.");
     }
+  }
 
-    if (isOn(System.getProperty("out", DEFAULT_OUT))) {
-      out = System.out;
+  private void doArgsMode() {
+    info("POSTing args to " + solrUrl + "..");
+    for (String a : args) {
+      postData(stringToStream(a), null, out, type, solrUrl);
     }
-    
-    t.setFileTypes(System.getProperty("filetypes", DEFAULT_FILE_TYPES));
+  }
 
-    int numFilesPosted = 0;
-    
+  private int doWebMode() {
+    reset();
+    int numPagesPosted = 0;
     try {
-      if (DATA_MODE_FILES.equals(mode)) {
-        if (0 < args.length) {
-          // Skip posting files if special param "-" given  
-          if (!args[0].equals("-")) {
-            info("Posting files to base url " + u + (!t.auto?" using content-type "+(type==null?DEFAULT_CONTENT_TYPE:type):"")+"..");
-            if(t.auto)
-              info("Entering auto mode. File endings considered are "+t.getFileTypes());
-            if(t.recursive)
-              info("Entering recursive mode"); 
-            numFilesPosted = t.postFiles(args, 0, out, type);
-            info(numFilesPosted + " files indexed.");
-          }
-        } else {
-            usageShort();
-            return;
-        }
-      } else if (DATA_MODE_ARGS.equals(mode)) {
-        if (0 < args.length) {
-          info("POSTing args to " + u + "..");
-          for (String a : args) {
-            t.postData(SimplePostTool.stringToStream(a), null, out, type);
-          }
-        } else {
-          usageShort();
-          return;
+      if(type != null) {
+        fatal("Specifying content-type with \"-Ddata=web\" is not supported");
+      }
+      if (args[0].equals("-")) {
+        // Skip posting url if special param "-" given  
+        return 0;
+      }
+      // Set Extracting handler as default
+      solrUrl = appendUrlPath(solrUrl, "/extract");
+      
+      info("Posting web pages to Solr url "+solrUrl);
+      auto=true;
+      info("Entering auto mode. Indexing pages with content-types corresponding to file endings "+fileTypes);
+      if(recursive > 0) {
+        if(recursive > MAX_WEB_DEPTH) {
+          recursive = MAX_WEB_DEPTH;
+          warn("Too large recursion depth for web mode, limiting to "+MAX_WEB_DEPTH+"...");
         }
-      } else if (DATA_MODE_STDIN.equals(mode)) {
-        info("POSTing stdin to " + u + "..");
-        t.postData(System.in, null, out, type);
-      }
-      if (isOn(System.getProperty("commit",DEFAULT_COMMIT))) {
-        info("COMMITting Solr index changes to " + u + "..");
-        t.commit();
-      }
-      if (isOn(System.getProperty("optimize",DEFAULT_OPTIMIZE))) {
-        info("Performing an OPTIMIZE to " + u + "..");
-        t.optimize();
+        if(delay < DEFAULT_WEB_DELAY)
+          warn("Never crawl an external web site faster than every 10 seconds, your IP will probably be blocked");
+        info("Entering recursive mode, depth="+recursive+", delay="+delay+"s");
       }
-    
-    } catch(RuntimeException e) {
-      e.printStackTrace();
-      fatal("RuntimeException " + e);
+      numPagesPosted = postWebPages(args, 0, out);
+      info(numPagesPosted + " web pages indexed.");
+    } catch(MalformedURLException e) {
+      fatal("Wrong URL trying to append /extract to "+solrUrl);
     }
+    return numPagesPosted;
+  }
+
+  private void doStdinMode() {
+    info("POSTing stdin to " + solrUrl + "..");
+    postData(System.in, null, out, type, solrUrl);    
   }
 
+  /** Clears the crawl state; keeps a configured fileTypes, using the default only when unset */
+  private void reset() {
+    if (fileTypes == null) fileTypes = DEFAULT_FILE_TYPES;
+    globFileFilter = this.getFileFilterFromFileTypes(fileTypes);
+    backlog = new ArrayList<LinkedHashSet<URL>>();
+    visited = new HashSet<URL>();
+  }
+
+  //
+  // USAGE
+  //
   private static void usageShort() {
     System.out.println(USAGE_STRING_SHORT+"\n"+
         "       Please invoke with -h option for extended usage help.");
@@ -200,11 +344,12 @@ public class SimplePostTool {
     System.out.println
     (USAGE_STRING_SHORT+"\n\n" +
      "Supported System Properties and their defaults:\n"+
-     "  -Ddata=files|args|stdin (default=" + DEFAULT_DATA_MODE + ")\n"+
+     "  -Ddata=files|web|args|stdin (default=" + DEFAULT_DATA_MODE + ")\n"+
      "  -Dtype=<content-type> (default=" + DEFAULT_CONTENT_TYPE + ")\n"+
      "  -Durl=<solr-update-url> (default=" + DEFAULT_POST_URL + ")\n"+
      "  -Dauto=yes|no (default=" + DEFAULT_AUTO + ")\n"+
-     "  -Drecursive=yes|no (default=" + DEFAULT_RECURSIVE + ")\n"+
+     "  -Drecursive=yes|no|<depth> (default=" + DEFAULT_RECURSIVE + ")\n"+
+     "  -Ddelay=<seconds> (default=0 for files, 10 for web)\n"+
      "  -Dfiletypes=<type>[,<type>,...] (default=" + DEFAULT_FILE_TYPES + ")\n"+
      "  -Dparams=\"<key>=<value>[&<key>=<value>...]\" (values must be URL-encoded)\n"+
      "  -Dcommit=yes|no (default=" + DEFAULT_COMMIT + ")\n"+
@@ -212,11 +357,12 @@ public class SimplePostTool {
      "  -Dout=yes|no (default=" + DEFAULT_OUT + ")\n\n"+
      "This is a simple command line tool for POSTing raw data to a Solr\n"+
      "port.  Data can be read from files specified as commandline args,\n"+
-     "as raw commandline arg strings, or via STDIN.\n"+
+     "URLs specified as args, as raw commandline arg strings or via STDIN.\n"+
      "Examples:\n"+
      "  java -jar post.jar *.xml\n"+
      "  java -Ddata=args  -jar post.jar '<delete><id>42</id></delete>'\n"+
      "  java -Ddata=stdin -jar post.jar < hd.xml\n"+
+     "  java -Ddata=web -jar post.jar http://example.com/\n"+
      "  java -Dtype=text/csv -jar post.jar *.csv\n"+
      "  java -Dtype=application/json -jar post.jar *.json\n"+
      "  java -Durl=http://localhost:8983/solr/update/extract -Dparams=literal.id=a -Dtype=application/pdf -jar post.jar a.pdf\n"+
@@ -228,13 +374,10 @@ public class SimplePostTool {
      "or optimize should be executed, and whether the response should\n"+
      "be written to STDOUT. If auto=yes the tool will try to set type\n"+
      "and url automatically from file name. When posting rich documents\n"+
-     "the file name will be propagated as \"resource.name\" and also used as \"literal.id\".\n" +
-     "You may override these or any other request parameter through the -Dparams property.\n"+
-     "If you want to do a commit only, use \"-\" as argument.");
-  }
-
-  private static boolean isOn(String property) {
-    return(TRUE_STRINGS.indexOf(property) >= 0);
+     "the file name will be propagated as \"resource.name\" and also used\n"+
+     "as \"literal.id\". You may override these or any other request parameter\n"+
+     "through the -Dparams property. To do a commit only, use \"-\" as argument.\n"+
+     "The web mode is a simple crawler following links within domain, default delay=10s.");
   }
 
   /** Post all filenames provided in args
@@ -244,7 +387,8 @@ public class SimplePostTool {
    * @param type default content-type to use when posting (may be overridden in auto mode)
    * @return number of files posted
    * */
-  int postFiles(String [] args,int startIndexInArgs, OutputStream out, String type) {
+  public int postFiles(String [] args,int startIndexInArgs, OutputStream out, String type) {
+    reset();
     int filesPosted = 0;
     for (int j = startIndexInArgs; j < args.length; j++) {
       File srcFile = new File(args[j]);
@@ -258,7 +402,7 @@ public class SimplePostTool {
         String fileGlob = srcFile.getName();
         GlobFileFilter ff = new GlobFileFilter(fileGlob, false);
         File[] files = parent.listFiles(ff);
-        if(files.length == 0) {
+        if(files == null || files.length == 0) {
           warn("No files or directories matching "+srcFile);
           continue;          
         }
@@ -268,32 +412,255 @@ public class SimplePostTool {
     return filesPosted;
   }
   
+  /** Post all files provided in the files array
+   * @param files array of Files; entries may be files, directories or glob patterns
+   * @param startIndexInArgs index of the first entry of files to post
+   * @param out output stream to post data to
+   * @param type default content-type to use when posting (may be overridden in auto mode)
+   * @return number of files posted
+   * */
+  public int postFiles(File[] files, int startIndexInArgs, OutputStream out, String type) {
+    reset();
+    int filesPosted = 0;
+    // Honor the offset like the String[] variant does; a negative offset starts at 0
+    for (int j = Math.max(0, startIndexInArgs); j < files.length; j++) {
+      File srcFile = files[j];
+      if(srcFile.isDirectory() && srcFile.canRead()) {
+        filesPosted += postDirectory(srcFile, out, type);
+      } else if (srcFile.isFile() && srcFile.canRead()) {
+        filesPosted += postFiles(new File[] {srcFile}, out, type);
+      } else {
+        // Neither a readable file nor directory: use the name as a glob within its parent
+        File parent = srcFile.getParentFile() != null ? srcFile.getParentFile() : new File(".");
+        File[] fileList = parent.listFiles(new GlobFileFilter(srcFile.getName(), false));
+        if(fileList == null || fileList.length == 0) {
+          warn("No files or directories matching "+srcFile);
+          continue;          
+        }
+        filesPosted += postFiles(fileList, out, type);
+      }
+    }
+    return filesPosted;
+  }
+  
+  /**
+   * Posts a whole directory
+   * @return number of files posted total
+   */
   private int postDirectory(File dir, OutputStream out, String type) {
     if(dir.isHidden() && !dir.getName().equals("."))
       return(0);
-    info("Indexing directory "+dir.getPath());
+    info("Indexing directory "+dir.getPath()+" ("+dir.listFiles(globFileFilter).length+" files, depth="+currentDepth+")");
     int posted = 0;
     posted += postFiles(dir.listFiles(globFileFilter), out, type);
-    if(recursive) {
+    if(recursive > currentDepth) {
       for(File d : dir.listFiles()) {
-        if(d.isDirectory())
+        if(d.isDirectory()) {
+          currentDepth++;
           posted += postDirectory(d, out, type);
+          currentDepth--;
+        }
       }
     }
     return posted;
   }
 
+  /**
+   * Posts a list of file names
+   * @return number of files posted
+   */
   int postFiles(File[] files, OutputStream out, String type) {
     int filesPosted = 0;
     for(File srcFile : files) {
-      if(!srcFile.isFile() || srcFile.isHidden())
-        continue;
-      postFile(srcFile, out, type);
-      filesPosted++;
+      try {
+        if(!srcFile.isFile() || srcFile.isHidden())
+          continue;
+        postFile(srcFile, out, type);
+        Thread.sleep(delay * 1000);
+        filesPosted++;
+      } catch (InterruptedException e) {
+        throw new RuntimeException();
+      }
     }
     return filesPosted;
   }
 
+  /**
+   * Takes a list of start URL strings for crawling, normalizes each one, puts
+   * them in level 0 of a fresh backlog and then crawls from level 0
+   * @param args the start URLs; malformed ones are skipped with a warning
+   * @param startIndexInArgs index of the first entry of args to use
+   * @param out outputStream to write results to
+   * @return the number of web pages posted
+   */
+  public int postWebPages(String[] args, int startIndexInArgs, OutputStream out) {
+    reset();
+    LinkedHashSet<URL> s = new LinkedHashSet<URL>();
+    for (int j = startIndexInArgs; j < args.length; j++) {
+      try {
+        URL u = new URL(normalizeUrlEnding(args[j]));
+        s.add(u);
+      } catch(MalformedURLException e) {
+        warn("Skipping malformed input URL: "+args[j]);
+      }
+    }
+    // Add URLs to level 0 of the backlog and start recursive crawling
+    backlog.add(s);
+    return webCrawl(0, out);
+  }
+
+  /**
+   * Normalizes a URL string by removing the anchor part (from the first "#"), then one trailing "?", then one trailing "/"
+   * @return the normalized URL string
+   */
+  protected static String normalizeUrlEnding(String link) {
+    if(link.indexOf("#") > -1)
+      link = link.substring(0,link.indexOf("#"));
+    if(link.endsWith("?"))
+      link = link.substring(0,link.length()-1);
+    if(link.endsWith("/"))
+      link = link.substring(0,link.length()-1);
+    return link;
+  }
+
+  /**
+   * A very simple crawler, pulling URLs to fetch from the backlog at the given
+   * level and posting each fetched page to Solr. While recursive>level, links
+   * of text/html pages (from pageFetcher.getLinksFromWebPage) become the next
+   * level. Sleeps for delay seconds after each successful post. Only meant for
+   * test purposes. If interrupted while sleeping, the interrupt flag is restored
+   * and a RuntimeException wrapping the InterruptedException is thrown.
+   * @param level which level to crawl
+   * @param out output stream to write to
+   * @return number of pages posted on this level and below
+   */
+  protected int webCrawl(int level, OutputStream out) {
+    int numPages = 0;
+    LinkedHashSet<URL> stack = backlog.get(level);
+    int rawStackSize = stack.size();
+    stack.removeAll(visited);
+    int stackSize = stack.size();
+    LinkedHashSet<URL> subStack = new LinkedHashSet<URL>();
+    info("Entering crawl at level "+level+" ("+rawStackSize+" links total, "+stackSize+" new)");
+    for(URL u : stack) {
+      try {
+        visited.add(u);
+        PageFetcherResult result = pageFetcher.readPageFromUrl(u);
+        if(result.httpStatus == 200) {
+          u = (result.redirectUrl != null) ? result.redirectUrl : u;
+          URL postUrl = new URL(appendParam(solrUrl.toString(), 
+              "literal.id="+URLEncoder.encode(u.toString(),"UTF-8") +
+              "&literal.url="+URLEncoder.encode(u.toString(),"UTF-8")));
+          boolean success = postData(new ByteArrayInputStream(result.content), null, out, result.contentType, postUrl);
+          if (success) {
+            info("POSTed web resource "+u+" (depth: "+level+")");
+            Thread.sleep(delay * 1000L); // long arithmetic, cannot overflow for large delays
+            numPages++;
+            // Pull links from HTML pages only
+            if(recursive > level && result.contentType.equals("text/html")) {
+              Set<URL> children = pageFetcher.getLinksFromWebPage(u, new ByteArrayInputStream(result.content), result.contentType, postUrl);
+              subStack.addAll(children);
+            }
+          } else {
+            warn("An error occurred while posting "+u);
+          }
+        } else {
+          warn("The URL "+u+" returned a HTTP result status of "+result.httpStatus);
+        }
+      } catch (IOException e) {
+        warn("Caught exception when trying to open connection to "+u+": "+e.getMessage());
+      } catch (InterruptedException e) {
+        Thread.currentThread().interrupt(); // keep the interrupt visible to callers
+        throw new RuntimeException(e);
+      }
+    }
+    if(!subStack.isEmpty()) {
+      backlog.add(subStack);
+      numPages += webCrawl(level+1, out);
+    }
+    return numPages;    
+  }
+
+  /**
+   * Reads an input stream fully into a byte array and closes the stream
+   * @param is the input stream; it is closed even if reading fails
+   * @return the byte array
+   * @throws IOException if reading or closing the stream fails
+   */
+  protected byte[] inputStreamToByteArray(InputStream is) throws IOException {
+    ByteArrayOutputStream bos = new ByteArrayOutputStream();
+    try {
+      byte[] buf = new byte[4096]; // copy in chunks instead of byte by byte
+      for (int len = is.read(buf); len > -1; len = is.read(buf))
+        bos.write(buf, 0, len);
+    } finally {
+      is.close();
+    }
+    return bos.toByteArray();
+  }
+
+  /**
+   * Computes the full URL based on a base url and a possibly relative link found
+   * in the href param of an HTML anchor.
+   * @param baseUrl the base url from where the link was found
+   * @param link the absolute (http:// or https://) or relative link
+   * @return the full URL string, or null for an empty link, another scheme or an image
+   */
+  protected String computeFullUrl(URL baseUrl, String link) {
+    if(link == null || link.length() == 0) {
+      return null;
+    }
+    if(!link.startsWith("http://") && !link.startsWith("https://")) { // "httpd.html" is relative
+      if(link.startsWith("/")) {
+        link = baseUrl.getProtocol() + "://" + baseUrl.getAuthority() + link;
+      } else {
+        if(link.contains(":")) {
+          return null; // Skip non-relative URLs
+        }
+        String path = baseUrl.getPath();
+        if(!path.endsWith("/")) {
+          int sep = path.lastIndexOf("/");
+          String file = path.substring(sep+1);
+          if(file.contains(".") || file.contains("?"))
+            path = path.substring(0,sep);
+        }
+        link = baseUrl.getProtocol() + "://" + baseUrl.getAuthority() + path + "/" + link;
+      }
+    }
+    link = normalizeUrlEnding(link);
+    String l = link.toLowerCase(Locale.ROOT);
+    // Simple brute force skip images
+    if(l.endsWith(".jpg") || l.endsWith(".jpeg") || l.endsWith(".png") || l.endsWith(".gif")) {
+      return null; // Skip images
+    }
+    return link;
+  }
+
+  /**
+   * Uses the mime-type map to reverse lookup whether a file ending mapped to
+   * the given type is one of the comma separated entries of the fileTypes option
+   * @param type what content-type to lookup
+   * @return true if this is a supported content type
+   */
+  protected boolean typeSupported(String type) {
+    // Delimit with commas so an ending only matches a whole entry ("doc" must not match inside "docx")
+    String accepted = "," + fileTypes.replaceAll("\\s+", "") + ",";
+    for(String key : mimeMap.keySet()) {
+      if(mimeMap.get(key).equals(type) && accepted.contains("," + key + ","))
+        return true;
+    }
+    return false;
+  }
+
+  /**
+   * Tests if a string is exactly one of "true", "on", "yes" or "1"
+   * @param property the string to test, may be null
+   * @return true if "on"; false for null, the empty string or any other value
+   */
+  protected static boolean isOn(String property) {
+    return "true".equals(property) || "on".equals(property) || "yes".equals(property) || "1".equals(property);
+  }
+  
   static void warn(String msg) {
     System.err.println("SimplePostTool: WARNING: " + msg);
   }
@@ -304,21 +671,14 @@ public class SimplePostTool {
 
   static void fatal(String msg) {
     System.err.println("SimplePostTool: FATAL: " + msg);
-    System.exit(1);
-  }
-
-  /**
-   * Constructs an instance for posting data to the specified Solr URL 
-   * (ie: "http://localhost:8983/solr/update")
-   */
-  public SimplePostTool(URL solrUrl) {
-    this.solrUrl = solrUrl;
+    System.exit(2);
   }
 
   /**
    * Does a simple commit operation 
    */
   public void commit() {
+    info("COMMITting Solr index changes to " + solrUrl + "..");
     doGet(appendParam(solrUrl.toString(), "commit=true"));
   }
 
@@ -326,9 +686,16 @@ public class SimplePostTool {
    * Does a simple optimize operation 
    */
   public void optimize() {
+    info("Performing an OPTIMIZE to " + solrUrl + "..");
     doGet(appendParam(solrUrl.toString(), "optimize=true"));
   }
 
+  /**
+   * Appends a URL query parameter to a URL 
+   * @param url the original URL
+   * @param param the parameter(s) to append, separated by "&"
+   * @return the string version of the resulting URL
+   */
   public static String appendParam(String url, String param) {
     String[] pa = param.split("&");
     for(String p : pa) {
@@ -360,13 +727,12 @@ public class SimplePostTool {
             // Default handler
           } else {
             // SolrCell
-            String urlStr = url.getProtocol() + "://" + url.getAuthority() + url.getPath() + "/extract" + (url.getQuery() != null ? "?"+url.getQuery() : "");
+            String urlStr = appendUrlPath(solrUrl, "/extract").toString();
             if(urlStr.indexOf("resource.name")==-1)
               urlStr = appendParam(urlStr, "resource.name=" + URLEncoder.encode(file.getAbsolutePath(), "UTF-8"));
             if(urlStr.indexOf("literal.id")==-1)
               urlStr = appendParam(urlStr, "literal.id=" + URLEncoder.encode(file.getAbsolutePath(), "UTF-8"));
             url = new URL(urlStr);
-//            info("Indexing to ExtractingRequestHandler with URL "+url);
           }
         } else {
           warn("Skipping "+file.getName()+". Unsupported file type for auto mode.");
@@ -390,7 +756,23 @@ public class SimplePostTool {
     }
   }
 
-  private String guessType(File file) {
+  /**
+   * Appends to the path of the URL, keeping its query string; other parts such as a fragment are not copied
+   * @param url the URL
+   * @param append the path to append, added verbatim right after the existing path
+   * @return a new URL built from protocol, authority, path + append and query
+   * @throws MalformedURLException if the rebuilt string is not a valid URL
+   */
+  protected static URL appendUrlPath(URL url, String append) throws MalformedURLException {
+    return new URL(url.getProtocol() + "://" + url.getAuthority() + url.getPath() + append + (url.getQuery() != null ? "?"+url.getQuery() : ""));
+  }
+
+  /**
+   * Guesses the type of a file, based on file name suffix
+   * @param file the file
+   * @return the content-type guessed
+   */
+  protected static String guessType(File file) {
     String name = file.getName();
     String suffix = name.substring(name.lastIndexOf(".")+1);
     return mimeMap.get(suffix.toLowerCase(Locale.ROOT));
@@ -412,6 +794,7 @@ public class SimplePostTool {
    */
   public static void doGet(URL url) {
     try {
+      if(mockMode) return;
       HttpURLConnection urlc = (HttpURLConnection) url.openConnection();
       if (HttpURLConnection.HTTP_OK != urlc.getResponseCode()) {
         warn("Solr returned an error #" + urlc.getResponseCode() + 
@@ -422,15 +805,14 @@ public class SimplePostTool {
     }
   }
 
-  public void postData(InputStream data, Integer length, OutputStream output, String type) {
-    postData(data, length, output, type, solrUrl);
-  }
-
   /**
    * Reads data from the data stream and posts it to solr,
    * writes to the response to output
+   * @return true if success
    */
-  public void postData(InputStream data, Integer length, OutputStream output, String type, URL url) {
+  public boolean postData(InputStream data, Integer length, OutputStream output, String type, URL url) {
+    if(mockMode) return true;
+    boolean success = true;
     if(type == null)
       type = DEFAULT_CONTENT_TYPE;
     HttpURLConnection urlc = null;
@@ -441,7 +823,6 @@ public class SimplePostTool {
           urlc.setRequestMethod("POST");
         } catch (ProtocolException e) {
           fatal("Shouldn't happen: HttpURLConnection doesn't support POST??"+e);
-                
         }
         urlc.setDoOutput(true);
         urlc.setDoInput(true);
@@ -453,6 +834,7 @@ public class SimplePostTool {
 
       } catch (IOException e) {
         fatal("Connection error (is Solr running at " + solrUrl + " ?): " + e);
+        success = false;
       }
       
       OutputStream out = null;
@@ -461,6 +843,7 @@ public class SimplePostTool {
         pipe(data, out);
       } catch (IOException e) {
         fatal("IOException while posting data: " + e);
+        success = false;
       } finally {
         try { if(out!=null) out.close(); } catch (IOException x) { /*NOOP*/ }
       }
@@ -470,12 +853,14 @@ public class SimplePostTool {
         if (HttpURLConnection.HTTP_OK != urlc.getResponseCode()) {
           warn("Solr returned an error #" + urlc.getResponseCode() + 
                 " " + urlc.getResponseMessage());
+          success = false;
         }
 
         in = urlc.getInputStream();
         pipe(in, output);
       } catch (IOException e) {
         warn("IOException while reading response: " + e);
+        success = false;
       } finally {
         try { if(in!=null) in.close(); } catch (IOException x) { /*NOOP*/ }
       }
@@ -483,8 +868,14 @@ public class SimplePostTool {
     } finally {
       if(urlc!=null) urlc.disconnect();
     }
+    return success;
   }
 
+  /**
+   * Converts a string to an input stream 
+   * @param s the string
+   * @return the input stream
+   */
   public static InputStream stringToStream(String s) {
     InputStream is = null;
     try {
@@ -508,36 +899,64 @@ public class SimplePostTool {
     if (null != dest) dest.flush();
   }
 
-  public boolean isAuto() {
-    return auto;
-  }
-
-  public void setAuto(boolean auto) {
-    this.auto = auto;
-  }
-
-  public boolean isRecursive() {
-    return recursive;
-  }
-
-  public void setRecursive(boolean recursive) {
-    this.recursive = recursive;
-  }
-
-  public String getFileTypes() {
-    return fileTypes;
-  }
-
-  public void setFileTypes(String fileTypes) {
-    this.fileTypes = fileTypes;
+  public GlobFileFilter getFileFilterFromFileTypes(String fileTypes) {
     String glob;
     if(fileTypes.equals("*"))
       glob = ".*";
     else
       glob = "^.*\\.(" + fileTypes.replace(",", "|") + ")$";
-    this.globFileFilter = new GlobFileFilter(glob, true);
+    return new GlobFileFilter(glob, true);
+  }
+
+  //
+  // Utility methods for XPath handling
+  //
+  
+  /**
+   * Gets all nodes matching an XPath, evaluated against the given node as a NODESET
+   */
+  public static NodeList getNodesFromXP(Node n, String xpath) throws XPathExpressionException {
+    XPathFactory factory = XPathFactory.newInstance();
+    XPath xp = factory.newXPath();
+    XPathExpression expr = xp.compile(xpath); // compiled on every call, nothing is cached
+    return (NodeList) expr.evaluate(n, XPathConstants.NODESET);
+  }
+  
+  /**
+   * Gets the string content of the nodes matching an XPath
+   * @param n the node (or doc)
+   * @param xpath the xpath string
+   * @param concatAll if true, text from all matching nodes will be concatenated, else only the first returned
+   * @return the space separated node values, trimmed; the empty string if nothing matches
+   * @throws XPathExpressionException if the xpath cannot be compiled or evaluated
+   */
+  public static String getXP(Node n, String xpath, boolean concatAll)
+      throws XPathExpressionException {
+    NodeList nodes = getNodesFromXP(n, xpath);
+    StringBuilder sb = new StringBuilder(); // method-local, no synchronized StringBuffer needed
+    for(int i = 0; i < nodes.getLength() ; i++) {
+      sb.append(nodes.item(i).getNodeValue()).append(" ");
+      if(!concatAll) break;
+    }
+    // trim() removes the trailing separator; no matches leaves the builder empty
+    return sb.toString().trim();
+  }
+  
+  /**
+   * Takes a string as input and returns a DOM 
+   */
+  public static Document makeDom(String in, String inputEncoding) throws SAXException, IOException,
+  ParserConfigurationException {
+    InputStream is = new ByteArrayInputStream(in
+        .getBytes(inputEncoding));
+    Document dom = DocumentBuilderFactory.newInstance()
+        .newDocumentBuilder().parse(is);
+    return dom;
   }
 
+  /**
+   * Inner class to filter files based on glob wildcards
+   */
   class GlobFileFilter implements FileFilter
   {
     private String _pattern;
@@ -571,4 +990,170 @@ public class SimplePostTool {
       return p.matcher(file.getName()).find();
     }
   }
+  
+  //
+  // Simple crawler class which can fetch a page and check for robots.txt
+  //
+  class PageFetcher {
+    Map<String, List<String>> robotsCache;
+    final String DISALLOW = "Disallow:";
+    
+    public PageFetcher() {
+      robotsCache = new HashMap<String,List<String>>();
+    }
+    
+    public PageFetcherResult readPageFromUrl(URL u) {
+      PageFetcherResult res = new PageFetcherResult();
+      try {
+        if (isDisallowedByRobots(u)) {
+          warn("The URL "+u+" is disallowed by robots.txt and will not be crawled.");
+          res.httpStatus = 403;
+          visited.add(u);
+          return res;
+        }
+        res.httpStatus = 404;
+        HttpURLConnection conn = (HttpURLConnection) u.openConnection();
+        conn.setRequestProperty("User-Agent", "SimplePostTool-crawler/"+VERSION_OF_THIS_TOOL+" (http://lucene.apache.org/solr/)");
+        conn.setRequestProperty("Accept-Encoding", "gzip, deflate");
+        conn.connect();
+        res.httpStatus = conn.getResponseCode();
+        if(!normalizeUrlEnding(conn.getURL().toString()).equals(normalizeUrlEnding(u.toString()))) {
+          info("The URL "+u+" caused a redirect to "+conn.getURL());
+          u = conn.getURL();
+          res.redirectUrl = u;
+          visited.add(u);
+        }
+        if(res.httpStatus == 200) {
+          // Raw content type of form "text/html; charset=utf-8"
+          String rawContentType = conn.getContentType();
+          String type = rawContentType.split(";")[0];
+          if(typeSupported(type)) {
+            String encoding = conn.getContentEncoding();
+            InputStream is;
+            if (encoding != null && encoding.equalsIgnoreCase("gzip")) {
+              is = new GZIPInputStream(conn.getInputStream());
+            } else if (encoding != null && encoding.equalsIgnoreCase("deflate")) {
+              is = new InflaterInputStream(conn.getInputStream(), new Inflater(true));
+            } else {
+              is = conn.getInputStream();
+            }
+            
+            // Read into memory, so that we later can pull links from the page without re-fetching 
+            res.content = inputStreamToByteArray(is);
+            is.close();
+          } else {
+            warn("Skipping URL with unsupported type "+type);
+            res.httpStatus = 415;
+          }
+        }
+      } catch(IOException e) {
+        warn("IOException when reading page from url "+u+": "+e.getMessage());
+      }
+      return res;
+    }
+    
+    public boolean isDisallowedByRobots(URL url) {
+      String host = url.getHost();
+      String strRobot = url.getProtocol() + "://" + host + "/robots.txt";
+      List<String> disallows = robotsCache.get(host);
+      if(disallows == null) {
+        disallows = new ArrayList<String>();
+        URL urlRobot;
+        try { 
+          urlRobot = new URL(strRobot);
+          disallows = parseRobotsTxt(urlRobot.openStream());
+        } catch (MalformedURLException e) {
+          return true; // We cannot trust this robots URL, should not happen
+        } catch (IOException e) {
+          // There is no robots.txt, will cache an empty disallow list
+        }
+      }
+      
+      robotsCache.put(host, disallows);
+
+      String strURL = url.getFile();
+      for (String path : disallows) {
+        if (path.equals("/") || strURL.indexOf(path) == 0)
+          return true;
+      }
+      return false;
+    }
+
+    /**
+     * Very simple robots.txt parser which obeys all Disallow lines regardless
+     * of user agent or whether there are valid Allow: lines.
+     * @param is Input stream of the robots.txt file
+     * @return a list of disallow paths
+     * @throws IOException if problems reading the stream
+     */
+    protected List<String> parseRobotsTxt(InputStream is) throws IOException {
+      List<String> disallows = new ArrayList<String>();
+      BufferedReader r = new BufferedReader(new InputStreamReader(is, "UTF-8"));
+      String l;
+      while((l = r.readLine()) != null) {
+        String[] arr = l.split("#");
+        if(arr.length == 0) continue;
+        l = arr[0].trim();
+        if(l.startsWith(DISALLOW)) {
+          l = l.substring(DISALLOW.length()).trim();
+          if(l.length() == 0) continue;
+          disallows.add(l);
+        }
+      }
+      is.close();
+      return disallows;
+    }
+
+    /**
+     * Finds links on a web page, using /extract?extractOnly=true
+     * @param u the URL of the web page
+     * @param is the input stream of the page
+     * @param type the content-type
+     * @param postUrl the URL (typically /solr/extract) in order to pull out links
+     * @return a set of URLs parsed from the page
+     */
+    protected Set<URL> getLinksFromWebPage(URL u, InputStream is, String type, URL postUrl) {
+      Set<URL> l = new HashSet<URL>();
+      URL url = null;
+      try {
+        ByteArrayOutputStream os = new ByteArrayOutputStream();
+        URL extractUrl = new URL(appendParam(postUrl.toString(), "extractOnly=true"));
+        boolean success = postData(is, null, os, type, extractUrl);
+        if(success) {
+          String rawXml = os.toString("UTF-8");
+          Document d = makeDom(rawXml, "UTF-8");
+          String innerXml = getXP(d, "/response/str/text()[1]", false);
+          d = makeDom(innerXml, "UTF-8");
+          NodeList links = getNodesFromXP(d, "/html/body//a/@href");
+          for(int i = 0; i < links.getLength(); i++) {
+            String link = links.item(i).getTextContent();
+            link = computeFullUrl(u, link);
+            if(link == null)
+              continue;
+            url = new URL(link);
+            if(url.getAuthority() == null || !url.getAuthority().equals(u.getAuthority()))
+              continue;
+            l.add(url);
+          }
+        }
+      } catch (MalformedURLException e) {
+        warn("Malformed URL "+url);
+      } catch (IOException e) {
+        warn("IOException opening URL "+url+": "+e.getMessage());
+      } catch (Exception e) {
+        throw new RuntimeException();
+      }
+      return l;
+    }
+  }
+    
+  /**
+   * Utility class to hold the result from a page fetch
+   */
+  public class PageFetcherResult {
+    int httpStatus = 200;
+    String contentType = "text/html";
+    URL redirectUrl = null;
+    byte[] content;
+  }
 }

Modified: lucene/dev/branches/LUCENE-2878/solr/core/src/java/org/apache/solr/util/SolrPluginUtils.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/solr/core/src/java/org/apache/solr/util/SolrPluginUtils.java?rev=1388574&r1=1388573&r2=1388574&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/solr/core/src/java/org/apache/solr/util/SolrPluginUtils.java (original)
+++ lucene/dev/branches/LUCENE-2878/solr/core/src/java/org/apache/solr/util/SolrPluginUtils.java Fri Sep 21 17:21:34 2012
@@ -19,6 +19,8 @@ package org.apache.solr.util;
 
 import org.apache.lucene.document.Document;
 import org.apache.lucene.index.IndexableField;
+import org.apache.lucene.index.StorableField;
+import org.apache.lucene.index.StoredDocument;
 import org.apache.lucene.queryparser.classic.ParseException;
 import org.apache.lucene.queryparser.classic.QueryParser;
 import org.apache.lucene.search.*;
@@ -229,57 +231,69 @@ public class SolrPluginUtils {
    * @return The debug info
    * @throws java.io.IOException if there was an IO error
    */
-  public static NamedList doStandardDebug(SolrQueryRequest req,
-                                          String userQuery,
-                                          Query query,
-                                          DocList results, boolean dbgQuery, boolean dbgResults)
-    throws IOException {
-
-    NamedList dbg = null;
-
-    dbg = new SimpleOrderedMap();
-
-    SolrIndexSearcher searcher = req.getSearcher();
-    IndexSchema schema = req.getSchema();
-
-    boolean explainStruct
-            = req.getParams().getBool(CommonParams.EXPLAIN_STRUCT, false);
-
+  public static NamedList doStandardDebug(
+          SolrQueryRequest req,
+          String userQuery,
+          Query query,
+          DocList results,
+          boolean dbgQuery,
+          boolean dbgResults)
+          throws IOException 
+  {
+    NamedList dbg = new SimpleOrderedMap();
+    doStandardQueryDebug(req, userQuery, query, dbgQuery, dbg);
+    doStandardResultsDebug(req, query, results, dbgResults, dbg);
+    return dbg;
+  }
+  
+  public static void doStandardQueryDebug(
+          SolrQueryRequest req,
+          String userQuery,
+          Query query,
+          boolean dbgQuery,
+          NamedList dbg)
+  {
     if (dbgQuery) {
       /* userQuery may have been pre-processed .. expose that */
       dbg.add("rawquerystring", req.getParams().get(CommonParams.Q));
       dbg.add("querystring", userQuery);
 
-      /* QueryParsing.toString isn't perfect, use it to see converted
+     /* QueryParsing.toString isn't perfect, use it to see converted
       * values, use regular toString to see any attributes of the
       * underlying Query it may have missed.
       */
-      dbg.add("parsedquery", QueryParsing.toString(query, schema));
+      dbg.add("parsedquery", QueryParsing.toString(query, req.getSchema()));
       dbg.add("parsedquery_toString", query.toString());
     }
-
+  }
+  
+  public static void doStandardResultsDebug(
+          SolrQueryRequest req,
+          Query query,
+          DocList results,
+          boolean dbgResults,
+          NamedList dbg) throws IOException
+  {
     if (dbgResults) {
-      NamedList<Explanation> explain
-              = getExplanations(query, results, searcher, schema);
-      dbg.add("explain", explainStruct ?
-              explanationsToNamedLists(explain) :
-              explanationsToStrings(explain));
+      SolrIndexSearcher searcher = req.getSearcher();
+      IndexSchema schema = req.getSchema();
+      boolean explainStruct = req.getParams().getBool(CommonParams.EXPLAIN_STRUCT, false);
+
+      NamedList<Explanation> explain = getExplanations(query, results, searcher, schema);
+      dbg.add("explain", explainStruct
+              ? explanationsToNamedLists(explain)
+              : explanationsToStrings(explain));
 
       String otherQueryS = req.getParams().get(CommonParams.EXPLAIN_OTHER);
       if (otherQueryS != null && otherQueryS.length() > 0) {
-        DocList otherResults = doSimpleQuery
-                (otherQueryS, req, 0, 10);
+        DocList otherResults = doSimpleQuery(otherQueryS, req, 0, 10);
         dbg.add("otherQuery", otherQueryS);
-        NamedList<Explanation> explainO
-                = getExplanations(query, otherResults, searcher, schema);
-        dbg.add("explainOther", explainStruct ?
-                explanationsToNamedLists(explainO) :
-                explanationsToStrings(explainO));
+        NamedList<Explanation> explainO = getExplanations(query, otherResults, searcher, schema);
+        dbg.add("explainOther", explainStruct
+                ? explanationsToNamedLists(explainO)
+                : explanationsToStrings(explainO));
       }
     }
-
-
-    return dbg;
   }
 
   public static NamedList<Object> explanationToNamedList(Explanation e) {
@@ -332,7 +346,7 @@ public class SolrPluginUtils {
     for (int i=0; i<docs.size(); i++) {
       int id = iterator.nextDoc();
 
-      Document doc = searcher.doc(id);
+      StoredDocument doc = searcher.doc(id);
       String strid = schema.printableUniqueKey(doc);
 
       explainList.add(strid, searcher.explain(query, id) );
@@ -848,10 +862,10 @@ public class SolrPluginUtils {
     while (dit.hasNext()) {
       int docid = dit.nextDoc();
 
-      Document luceneDoc = searcher.doc(docid, fields);
+      StoredDocument luceneDoc = searcher.doc(docid, fields);
       SolrDocument doc = new SolrDocument();
       
-      for( IndexableField field : luceneDoc) {
+      for( StorableField field : luceneDoc) {
         if (null == fields || fields.contains(field.name())) {
           SchemaField sf = schema.getField( field.name() );
           doc.addField( field.name(), sf.getType().toObject( field ) );

Modified: lucene/dev/branches/LUCENE-2878/solr/core/src/java/org/apache/solr/util/plugin/AbstractPluginLoader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/solr/core/src/java/org/apache/solr/util/plugin/AbstractPluginLoader.java?rev=1388574&r1=1388573&r2=1388574&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/solr/core/src/java/org/apache/solr/util/plugin/AbstractPluginLoader.java (original)
+++ lucene/dev/branches/LUCENE-2878/solr/core/src/java/org/apache/solr/util/plugin/AbstractPluginLoader.java Fri Sep 21 17:21:34 2012
@@ -72,9 +72,12 @@ public abstract class AbstractPluginLoad
   
   /**
    * Create a plugin from an XML configuration.  Plugins are defined using:
-   *   <plugin name="name1" class="solr.ClassName">
+   * <pre class="prettyprint">
+   * {@code
+   * <plugin name="name1" class="solr.ClassName">
    *      ...
-   *   </plugin>
+   * </plugin>}
+   * </pre>
    * 
    * @param name - The registered name.  In the above example: "name1"
    * @param className - class name for requested plugin.  In the above example: "solr.ClassName"
@@ -101,16 +104,19 @@ public abstract class AbstractPluginLoad
   abstract protected void init( T plugin, Node node ) throws Exception;
 
   /**
+   * Initializes and registers each plugin in the list.
    * Given a NodeList from XML in the form:
-   * 
-   *  <plugins>
+   * <pre class="prettyprint">
+   * {@code
+   * <plugins>
    *    <plugin name="name1" class="solr.ClassName" >
    *      ...
    *    </plugin>
    *    <plugin name="name2" class="solr.ClassName" >
    *      ...
    *    </plugin>
-   *  </plugins>
+   * </plugins>}
+   * </pre>
    * 
    * This will initialize and register each plugin from the list.  A class will 
    * be generated for each class name and registered to the given name.
@@ -120,7 +126,10 @@ public abstract class AbstractPluginLoad
    * plugins at startup.
    * 
    * One (and only one) plugin may declare itself to be the 'default' plugin using:
-   *    <plugin name="name2" class="solr.ClassName" default="true">
+   * <pre class="prettyprint">
+   * {@code
+   *    <plugin name="name2" class="solr.ClassName" default="true">}
+   * </pre>
    * If a default element is defined, it will be returned from this function.
    * 
    */
@@ -190,9 +199,13 @@ public abstract class AbstractPluginLoad
   }
   
   /**
-   * Given a NodeList from XML in the form:
+   * Initializes and registers a single plugin.
    * 
-   * <plugin name="name1" class="solr.ClassName" > ... </plugin>
+   * Given a NodeList from XML in the form:
+   * <pre class="prettyprint">
+   * {@code
+   * <plugin name="name1" class="solr.ClassName" > ... </plugin>}
+   * </pre>
    * 
    * This will initialize and register a single plugin. A class will be
    * generated for the plugin and registered to the given name.

Modified: lucene/dev/branches/LUCENE-2878/solr/core/src/java/overview.html
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/solr/core/src/java/overview.html?rev=1388574&r1=1388573&r2=1388574&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/solr/core/src/java/overview.html (original)
+++ lucene/dev/branches/LUCENE-2878/solr/core/src/java/overview.html Fri Sep 21 17:21:34 2012
@@ -16,6 +16,6 @@
 -->
 <html>
 <body>
-Apache Solr Search Server, new users should familiarize themselves with the <a href="doc-files/tutorial.html">Solr Tutorial</a>.
+Apache Solr Search Server (Core Javadocs).
 </body>
 </html>

Modified: lucene/dev/branches/LUCENE-2878/solr/core/src/test-files/solr/collection1/conf/elevate.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/solr/core/src/test-files/solr/collection1/conf/elevate.xml?rev=1388574&r1=1388573&r2=1388574&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/solr/core/src/test-files/solr/collection1/conf/elevate.xml (original)
+++ lucene/dev/branches/LUCENE-2878/solr/core/src/test-files/solr/collection1/conf/elevate.xml Fri Sep 21 17:21:34 2012
@@ -41,6 +41,11 @@
   <doc id="5" />
   <doc id="6" exclude="true" />
  </query>
+ 
+ <query text="QQQQ">
+  <doc id="10" exclude="true" />
+ </query>
+ 
 
  <query text="solr">
   <doc id="7" />

Modified: lucene/dev/branches/LUCENE-2878/solr/core/src/test-files/solr/collection1/conf/schema-folding.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/solr/core/src/test-files/solr/collection1/conf/schema-folding.xml?rev=1388574&r1=1388573&r2=1388574&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/solr/core/src/test-files/solr/collection1/conf/schema-folding.xml (original)
+++ lucene/dev/branches/LUCENE-2878/solr/core/src/test-files/solr/collection1/conf/schema-folding.xml Fri Sep 21 17:21:34 2012
@@ -206,6 +206,16 @@
       </analyzer>
     </fieldType>
 
+    <fieldType name="text_fr" class="solr.TextField" positionIncrementGap="100">
+      <analyzer> 
+        <tokenizer class="solr.StandardTokenizerFactory"/>
+        <!-- removes l', etc -->
+        <filter class="solr.ElisionFilterFactory" ignoreCase="true" />
+        <filter class="solr.LowerCaseFilterFactory"/>
+        <filter class="solr.FrenchLightStemFilterFactory"/>
+      </analyzer>
+    </fieldType>
+
 
     <fieldType name="int" class="solr.TrieIntField" precisionStep="4" omitNorms="true" positionIncrementGap="0"/>
     <fieldType name="float" class="solr.TrieFloatField" precisionStep="4" omitNorms="true" positionIncrementGap="0"/>
@@ -250,6 +260,7 @@
     <dynamicField name="*_folding" type="text_folding" indexed="true" stored="true"/>
     <dynamicField name="*_stemming" type="text_stemming" indexed="true" stored="true"/>
     <dynamicField name="*_keyword" type="text_keyword" indexed="true" stored="true"/>
+    <dynamicField name="*_fr" type="text_fr" indexed="true" stored="true"/>
 
   </fields>
 

Modified: lucene/dev/branches/LUCENE-2878/solr/core/src/test-files/solr/collection1/conf/schema.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/solr/core/src/test-files/solr/collection1/conf/schema.xml?rev=1388574&r1=1388573&r2=1388574&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/solr/core/src/test-files/solr/collection1/conf/schema.xml (original)
+++ lucene/dev/branches/LUCENE-2878/solr/core/src/test-files/solr/collection1/conf/schema.xml Fri Sep 21 17:21:34 2012
@@ -399,9 +399,11 @@
   <fieldType name="latLon" class="solr.LatLonType" subFieldType="double"/>
 
   <!-- Currency type -->
-  <fieldType name="currency" class="solr.CurrencyField" currencyConfig="currency.xml"/>
-  <fieldType name="mock_currency" class="solr.CurrencyField" providerClass="solr.MockExchangeRateProvider" foo="bar" />
-  <fieldType name="openexchangeratesorg_currency" class="solr.CurrencyField" 
+  <fieldType name="currency" class="solr.CurrencyField" currencyConfig="currency.xml" multiValued="false" />
+  <fieldType name="mock_currency" class="solr.CurrencyField" providerClass="solr.MockExchangeRateProvider" foo="bar" multiValued="false" />
+  <fieldType name="openexchangeratesorg_currency" 
+             class="solr.CurrencyField" 
+             multiValued="false"
              providerClass="solr.OpenExchangeRatesOrgProvider"
              ratesFileLocation="open-exchange-rates.json" />
 

Modified: lucene/dev/branches/LUCENE-2878/solr/core/src/test-files/solr/collection1/conf/schema12.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/solr/core/src/test-files/solr/collection1/conf/schema12.xml?rev=1388574&r1=1388573&r2=1388574&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/solr/core/src/test-files/solr/collection1/conf/schema12.xml (original)
+++ lucene/dev/branches/LUCENE-2878/solr/core/src/test-files/solr/collection1/conf/schema12.xml Fri Sep 21 17:21:34 2012
@@ -591,6 +591,10 @@
 
    <dynamicField name="random_*" type="random" />
 
+   <!-- unused, for testing luke copyFields -->
+   <dynamicField name="foo_copysource_*" type="ignored" multiValued="true"/>
+   <dynamicField name="bar_copydest_*" type="ignored" multiValued="true"/>
+
  </fields>
 
  <defaultSearchField>text</defaultSearchField>
@@ -601,5 +605,7 @@
 
    <copyField source="title" dest="text"/>
    <copyField source="subject" dest="text"/>
+
+   <copyField source="foo_copysource_*" dest="bar_copydest_*" />
  
 </schema>

Modified: lucene/dev/branches/LUCENE-2878/solr/core/src/test-files/solr/collection1/conf/schema15.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/solr/core/src/test-files/solr/collection1/conf/schema15.xml?rev=1388574&r1=1388573&r2=1388574&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/solr/core/src/test-files/solr/collection1/conf/schema15.xml (original)
+++ lucene/dev/branches/LUCENE-2878/solr/core/src/test-files/solr/collection1/conf/schema15.xml Fri Sep 21 17:21:34 2012
@@ -425,7 +425,7 @@
    <field name="signatureField" type="string" indexed="true" stored="false"/>
    <field name="uuid" type="uuid" stored="true" />
    <field name="name" type="nametext" indexed="true" stored="true"/>
-   <field name="text" type="text" indexed="true" stored="false"/>
+   <field name="text" type="text" indexed="true" stored="false" multiValued="true" />
    <field name="subject" type="text" indexed="true" stored="true"/>
    <field name="title" type="nametext" indexed="true" stored="true"/>
    <field name="weight" type="float" indexed="true" stored="true"/>
@@ -522,6 +522,9 @@
    <!-- for versioning -->
    <field name="_version_" type="long" indexed="true" stored="true"/>
 
+    
+   <field name="copyfield_source" type="string" indexed="true" stored="true" multiValued="true"/>
+
 
 
    <dynamicField name="*_coordinate"  type="tdouble" indexed="true"  stored="false"/>
@@ -587,5 +590,8 @@
 
    <copyField source="title" dest="text"/>
    <copyField source="subject" dest="text"/>
+
+   <copyField source="copyfield_source" dest="text"/>
+   <copyField source="copyfield_source" dest="copyfield_dest_ss"/>  <!-- copyField into another stored copyField - not best practice --> 
  
 </schema>

Modified: lucene/dev/branches/LUCENE-2878/solr/core/src/test-files/solr/collection1/conf/solrconfig-master.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/solr/core/src/test-files/solr/collection1/conf/solrconfig-master.xml?rev=1388574&r1=1388573&r2=1388574&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/solr/core/src/test-files/solr/collection1/conf/solrconfig-master.xml (original)
+++ lucene/dev/branches/LUCENE-2878/solr/core/src/test-files/solr/collection1/conf/solrconfig-master.xml Fri Sep 21 17:21:34 2012
@@ -41,7 +41,10 @@
   <requestHandler name="/replication" class="solr.ReplicationHandler">
     <lst name="master">
       <str name="replicateAfter">commit</str>
-      <str name="confFiles">schema.xml</str>
+      <!-- we don't really need dummy.xsl, but we want to be sure subdir 
+           files replicate (see SOLR-3809)
+      -->
+      <str name="confFiles">schema.xml,xslt/dummy.xsl</str>
     </lst>
   </requestHandler>
 

Modified: lucene/dev/branches/LUCENE-2878/solr/core/src/test-files/solr/collection1/conf/solrconfig-repeater.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/solr/core/src/test-files/solr/collection1/conf/solrconfig-repeater.xml?rev=1388574&r1=1388573&r2=1388574&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/solr/core/src/test-files/solr/collection1/conf/solrconfig-repeater.xml (original)
+++ lucene/dev/branches/LUCENE-2878/solr/core/src/test-files/solr/collection1/conf/solrconfig-repeater.xml Fri Sep 21 17:21:34 2012
@@ -55,7 +55,7 @@
       <str name="confFiles">schema.xml</str>
     </lst>
     <lst name="slave">
-      <str name="masterUrl">http://localhost:TEST_PORT/solr/replication</str>
+      <str name="masterUrl">http://127.0.0.1:TEST_PORT/solr/replication</str>
       <str name="pollInterval">00:00:01</str>
     </lst>
   </requestHandler>

Modified: lucene/dev/branches/LUCENE-2878/solr/core/src/test-files/solr/collection1/conf/solrconfig-slave.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/solr/core/src/test-files/solr/collection1/conf/solrconfig-slave.xml?rev=1388574&r1=1388573&r2=1388574&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/solr/core/src/test-files/solr/collection1/conf/solrconfig-slave.xml (original)
+++ lucene/dev/branches/LUCENE-2878/solr/core/src/test-files/solr/collection1/conf/solrconfig-slave.xml Fri Sep 21 17:21:34 2012
@@ -51,9 +51,10 @@
 
   <requestHandler name="/replication" class="solr.ReplicationHandler">
 	<lst name="slave">
-		<str name="masterUrl">http://localhost:TEST_PORT/solr</str>
+		<str name="masterUrl">http://127.0.0.1:TEST_PORT/solr</str>
 		<str name="pollInterval">00:00:01</str>
-	</lst>
+        <str name="compression">COMPRESSION</str>
+     </lst>
   </requestHandler>
 
 

Modified: lucene/dev/branches/LUCENE-2878/solr/core/src/test-files/solr/collection1/conf/solrconfig-update-processor-chains.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/solr/core/src/test-files/solr/collection1/conf/solrconfig-update-processor-chains.xml?rev=1388574&r1=1388573&r2=1388574&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/solr/core/src/test-files/solr/collection1/conf/solrconfig-update-processor-chains.xml (original)
+++ lucene/dev/branches/LUCENE-2878/solr/core/src/test-files/solr/collection1/conf/solrconfig-update-processor-chains.xml Fri Sep 21 17:21:34 2012
@@ -231,6 +231,12 @@
     </processor>
   </updateRequestProcessorChain>
 
+  <updateRequestProcessorChain name="count">
+    <processor class="solr.CountFieldValuesUpdateProcessorFactory">
+      <str name="fieldName">count_field</str>
+    </processor>
+  </updateRequestProcessorChain>
+
   <updateRequestProcessorChain name="ignore-not-in-schema">
     <processor class="solr.IgnoreFieldUpdateProcessorFactory" />
   </updateRequestProcessorChain>
@@ -344,6 +350,20 @@
     </processor>
   </updateRequestProcessorChain>
 
+  <updateRequestProcessorChain name="clone-then-count">
+    <processor class="solr.CloneFieldUpdateProcessorFactory">
+      <str name="source">category</str>
+      <str name="dest">category_count</str>
+    </processor>
+    <processor class="solr.CountFieldValuesUpdateProcessorFactory">
+      <str name="fieldName">category_count</str>
+    </processor>
+    <processor class="solr.DefaultValueUpdateProcessorFactory">
+      <str name="fieldName">category_count</str>
+      <int name="value">0</int>
+    </processor>
+  </updateRequestProcessorChain>
+
   <updateRequestProcessorChain name="regex-replace">
     <processor class="solr.RegexReplaceProcessorFactory">
       <str name="fieldName">content</str>

Modified: lucene/dev/branches/LUCENE-2878/solr/core/src/test-files/solr/collection1/conf/solrconfig-xinclude.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/solr/core/src/test-files/solr/collection1/conf/solrconfig-xinclude.xml?rev=1388574&r1=1388573&r2=1388574&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/solr/core/src/test-files/solr/collection1/conf/solrconfig-xinclude.xml (original)
+++ lucene/dev/branches/LUCENE-2878/solr/core/src/test-files/solr/collection1/conf/solrconfig-xinclude.xml Fri Sep 21 17:21:34 2012
@@ -25,4 +25,9 @@
       <xi:include href="solrconfig-reqHandler.incl" xmlns:xi="http://www.w3.org/2001/XInclude"/>
     </xi:fallback>
   </xi:include>
+
+  <updateRequestProcessorChain name="special-include" xmlns:xi="http://www.w3.org/2001/XInclude">
+    <xi:include href="solrconfig-snippet-processor.xml" />
+  </updateRequestProcessorChain>
+
 </config>

Modified: lucene/dev/branches/LUCENE-2878/solr/core/src/test-files/solr/collection1/conf/solrconfig.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/solr/core/src/test-files/solr/collection1/conf/solrconfig.xml?rev=1388574&r1=1388573&r2=1388574&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/solr/core/src/test-files/solr/collection1/conf/solrconfig.xml (original)
+++ lucene/dev/branches/LUCENE-2878/solr/core/src/test-files/solr/collection1/conf/solrconfig.xml Fri Sep 21 17:21:34 2012
@@ -54,7 +54,7 @@
     -->
     <maxBufferedDocs>10</maxBufferedDocs>
     <mergePolicy class="org.apache.lucene.index.LogDocMergePolicy"/>
-    <lockType>single</lockType>
+    <lockType>native</lockType>
     <unlockOnStartup>true</unlockOnStartup>
   </indexConfig>
   

Modified: lucene/dev/branches/LUCENE-2878/solr/core/src/test-files/solr/solr.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/LUCENE-2878/solr/core/src/test-files/solr/solr.xml?rev=1388574&r1=1388573&r2=1388574&view=diff
==============================================================================
--- lucene/dev/branches/LUCENE-2878/solr/core/src/test-files/solr/solr.xml (original)
+++ lucene/dev/branches/LUCENE-2878/solr/core/src/test-files/solr/solr.xml Fri Sep 21 17:21:34 2012
@@ -28,7 +28,8 @@
   adminPath: RequestHandler path to manage cores.  
     If 'null' (or absent), cores will not be manageable via request handler
   -->
-  <cores adminPath="/admin/cores" defaultCoreName="collection1" host="127.0.0.1" hostPort="${hostPort:8983}" hostContext="solr" zkClientTimeout="8000" numShards="${numShards:3}">
+  <cores adminPath="/admin/cores" defaultCoreName="collection1" host="127.0.0.1" hostPort="${hostPort:8983}" 
+         hostContext="solr" zkClientTimeout="5000" numShards="${numShards:3}" shareSchema="${shareSchema:false}">
     <core name="collection1" instanceDir="collection1" shard="${shard:}" collection="${collection:collection1}" config="${solrconfig:solrconfig.xml}" schema="${schema:schema.xml}"/>
   </cores>
 </solr>