Posted to commits@solr.apache.org by ep...@apache.org on 2024/02/19 17:10:34 UTC

(solr) branch branch_9x updated (7e40daa7ba2 -> f30d5e7975d)

This is an automated email from the ASF dual-hosted git repository.

epugh pushed a change to branch branch_9x
in repository https://gitbox.apache.org/repos/asf/solr.git


    from 7e40daa7ba2 solr.xml: honor plugin enable=true|false (#2260)
     new 8a427ebc606 SOLR-17159: Untangle PostTool and SimplePostTool code (#2275)
     new f30d5e7975d backporting from main

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 solr/CHANGES.txt                                   |    3 +
 .../src/java/org/apache/solr/cli/PostTool.java     | 1166 +++++++++++++++++++-
 .../java/org/apache/solr/cli/RunExampleTool.java   |   30 +-
 .../src/test/org/apache/solr/cli/PostToolTest.java |  220 +++-
 .../apache/solr/cloud/SolrCloudExampleTest.java    |   59 +-
 solr/packaging/test/test_post.bats                 |   22 +-
 6 files changed, 1410 insertions(+), 90 deletions(-)


(solr) 02/02: backporting from main

Posted by ep...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

epugh pushed a commit to branch branch_9x
in repository https://gitbox.apache.org/repos/asf/solr.git

commit f30d5e7975da5d7271ab05635ede694ff4668049
Author: Eric Pugh <ep...@opensourceconnections.com>
AuthorDate: Mon Feb 19 12:10:27 2024 -0500

    backporting from main
---
 solr/core/src/java/org/apache/solr/cli/PostTool.java               | 4 ++--
 solr/core/src/test/org/apache/solr/cloud/SolrCloudExampleTest.java | 2 ++
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/solr/core/src/java/org/apache/solr/cli/PostTool.java b/solr/core/src/java/org/apache/solr/cli/PostTool.java
index 0e3bc6b77c1..b75e839aa7f 100644
--- a/solr/core/src/java/org/apache/solr/cli/PostTool.java
+++ b/solr/core/src/java/org/apache/solr/cli/PostTool.java
@@ -719,7 +719,7 @@ public class PostTool extends ToolBase {
     info("COMMITting Solr index changes to " + solrUpdateUrl + "...");
     String url = solrUpdateUrl.toString();
     url = url.substring(0, url.lastIndexOf("/update"));
-    try (final SolrClient client = SolrCLI.getSolrClient(url, credentials)) {
+    try (final SolrClient client = SolrCLI.getSolrClient(url)) {
       client.commit();
     }
   }
@@ -729,7 +729,7 @@ public class PostTool extends ToolBase {
     info("Performing an OPTIMIZE to " + solrUpdateUrl + "...");
     String url = solrUpdateUrl.toString();
     url = url.substring(0, url.lastIndexOf("/update"));
-    try (final SolrClient client = SolrCLI.getSolrClient(url, credentials)) {
+    try (final SolrClient client = SolrCLI.getSolrClient(url)) {
       client.optimize();
     }
   }
diff --git a/solr/core/src/test/org/apache/solr/cloud/SolrCloudExampleTest.java b/solr/core/src/test/org/apache/solr/cloud/SolrCloudExampleTest.java
index a40903c1f0a..b3087503a7b 100644
--- a/solr/core/src/test/org/apache/solr/cloud/SolrCloudExampleTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/SolrCloudExampleTest.java
@@ -35,6 +35,7 @@ import org.apache.solr.util.ExternalPaths;
 import org.junit.Test;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
+import org.apache.solr.SolrTestCaseJ4;
 
 /**
  * Emulates bin/solr start -e cloud -noprompt; bin/solr post -c gettingstarted
@@ -42,6 +43,7 @@ import org.slf4j.LoggerFactory;
  * docs in collections that use data driven functionality and managed schema features of the default
  * configset (configsets/_default).
  */
+@SolrTestCaseJ4.SuppressSSL
 public class SolrCloudExampleTest extends AbstractFullDistribZkTestBase {
 
   private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
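The two PostTool.java hunks above drop the credentials argument from SolrCLI.getSolrClient for the commit and optimize paths. For reference, a minimal sketch of the same try-with-resources commit/optimize pattern in plain SolrJ; the builder class and base URL here are assumptions for illustration, not what SolrCLI.getSolrClient returns:

    import java.io.IOException;
    import org.apache.solr.client.solrj.SolrClient;
    import org.apache.solr.client.solrj.SolrServerException;
    import org.apache.solr.client.solrj.impl.HttpSolrClient;

    public class CommitOptimizeSketch {
      public static void main(String[] args) throws SolrServerException, IOException {
        // PostTool derives this base URL by stripping the trailing "/update"
        // from the update URL; the value here is hypothetical.
        String baseUrl = "http://localhost:8983/solr/mycollection";
        try (SolrClient client = new HttpSolrClient.Builder(baseUrl).build()) {
          client.commit();   // flush pending updates and reopen searchers
          client.optimize(); // force-merge segments; expensive and rarely needed
        }
      }
    }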


(solr) 01/02: SOLR-17159: Untangle PostTool and SimplePostTool code (#2275)

Posted by ep...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

epugh pushed a commit to branch branch_9x
in repository https://gitbox.apache.org/repos/asf/solr.git

commit 8a427ebc606d7967bcaaef30a9449bc0bf61b25b
Author: Eric Pugh <ep...@opensourceconnections.com>
AuthorDate: Mon Feb 19 11:46:47 2024 -0500

    SOLR-17159: Untangle PostTool and SimplePostTool code (#2275)
    
    * Copied unit tests from SimplePostToolTest to PostToolTest
    * Added a --dry-run mode that simulates sending documents to Solr
    * Caught a few more places where -commit is no longer needed now that SOLR-17147 is completed
    * Cleaned up the code base to reduce code-quality warnings
    * Updated SolrCloudExampleTest to use PostTool instead of simulating its usage
    * Made the long form of -url an explicit --solr-update-url to make clear what it is for (see the sketch below)
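A hedged sketch of the new flags in action, mirroring how the updated PostToolTest further down drives the tool through SolrCLI. The collection name and input file are hypothetical, and the snippet assumes it lives in the org.apache.solr.cli package:

    import org.apache.commons.cli.CommandLine;

    public class DryRunCliSketch {
      public static void main(String[] ignored) throws Exception {
        String[] args = {
          "post",
          "--solr-update-url",
          "http://localhost:8983/solr/films/update", // hypothetical collection
          "--dry-run",                               // log what would be sent, post nothing
          "example/films/films.json"                 // hypothetical input file
        };
        PostTool tool = new PostTool();
        CommandLine cli = SolrCLI.parseCmdLine(tool.getName(), args, tool.getOptions());
        tool.runTool(cli);
      }
    }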
---
 solr/CHANGES.txt                                   |    3 +
 .../src/java/org/apache/solr/cli/PostTool.java     | 1166 +++++++++++++++++++-
 .../java/org/apache/solr/cli/RunExampleTool.java   |   30 +-
 .../src/test/org/apache/solr/cli/PostToolTest.java |  220 +++-
 .../apache/solr/cloud/SolrCloudExampleTest.java    |   57 +-
 solr/packaging/test/test_post.bats                 |   22 +-
 6 files changed, 1408 insertions(+), 90 deletions(-)

diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index ab43eb599a7..54c96bc87bb 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -26,6 +26,9 @@ Improvements
 
 * SOLR-17145: The INSTALLSHARDDATA API now includes a 'requestid' field when run asynchronously (Jason Gerlowski)
 
+* SOLR-17159: bin/solr post now has proper unit testing.  Users can specify a --dry-run option to 
+  simulate posting documents without sending them to Solr. (Eric Pugh)
+
 Optimizations
 ---------------------
 * SOLR-17144: Close searcherExecutor thread per core after 1 minute (Pierre Salagnac, Christine Poerschke)
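The dry-run path the CHANGES entry describes can also be driven programmatically, as the new PostToolTest.testDoFilesMode further down does. A minimal sketch, assuming the same package as PostTool (the fields set here are package-private) plus a hypothetical URL and directory:

    import java.net.URL;

    public class FilesDryRunSketch {
      public static void main(String[] args) throws Exception {
        PostTool postTool = new PostTool();
        postTool.recursive = 0; // stay in the top-level directory
        postTool.dryRun = true; // log what would be indexed, send nothing
        postTool.solrUpdateUrl = new URL("http://localhost:8983/solr/fake/update"); // hypothetical
        int num = postTool.postFiles(new String[] {"exampledocs"}, 0, null, null);  // hypothetical dir
        System.out.println(num + " files would have been indexed");
      }
    }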
diff --git a/solr/core/src/java/org/apache/solr/cli/PostTool.java b/solr/core/src/java/org/apache/solr/cli/PostTool.java
index de716c131cb..0e3bc6b77c1 100644
--- a/solr/core/src/java/org/apache/solr/cli/PostTool.java
+++ b/solr/core/src/java/org/apache/solr/cli/PostTool.java
@@ -16,15 +16,144 @@
  */
 package org.apache.solr.cli;
 
+import static java.nio.charset.StandardCharsets.US_ASCII;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+import java.io.BufferedReader;
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.FileFilter;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
 import java.io.OutputStream;
 import java.io.PrintStream;
+import java.net.HttpURLConnection;
+import java.net.MalformedURLException;
+import java.net.ProtocolException;
+import java.net.URI;
+import java.net.URISyntaxException;
 import java.net.URL;
+import java.net.URLEncoder;
+import java.nio.ByteBuffer;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.security.GeneralSecurityException;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.Base64;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.LinkedHashSet;
 import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Set;
+import java.util.TimeZone;
+import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
+import java.util.zip.GZIPInputStream;
+import java.util.zip.Inflater;
+import java.util.zip.InflaterInputStream;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.xpath.XPath;
+import javax.xml.xpath.XPathConstants;
+import javax.xml.xpath.XPathExpression;
+import javax.xml.xpath.XPathExpressionException;
+import javax.xml.xpath.XPathFactory;
 import org.apache.commons.cli.CommandLine;
 import org.apache.commons.cli.Option;
+import org.apache.solr.client.api.util.SolrVersion;
+import org.apache.solr.client.solrj.SolrClient;
+import org.apache.solr.client.solrj.SolrServerException;
+import org.apache.solr.common.util.Utils;
+import org.apache.solr.util.RTimer;
+import org.w3c.dom.Document;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+import org.xml.sax.SAXException;
 
 public class PostTool extends ToolBase {
 
+  public static final String DEFAULT_FILE_TYPES =
+      "xml,json,jsonl,csv,pdf,doc,docx,ppt,pptx,xls,xlsx,odt,odp,ods,ott,otp,ots,rtf,htm,html,txt,log";
+  static final String DATA_MODE_FILES = "files";
+  static final String DATA_MODE_ARGS = "args";
+  static final String DATA_MODE_STDIN = "stdin";
+  static final String DEFAULT_DATA_MODE = DATA_MODE_FILES;
+  static final String FORMAT_SOLR = "solr";
+  static final String DATA_MODE_WEB = "web";
+
+  private static final int DEFAULT_WEB_DELAY = 10;
+  private static final int MAX_WEB_DEPTH = 10;
+  public static final String DEFAULT_CONTENT_TYPE = "application/json";
+
+  // Input args
+  int recursive = 0;
+  int delay = 0;
+  String fileTypes = PostTool.DEFAULT_FILE_TYPES;
+  URL solrUpdateUrl;
+  String credentials;
+  OutputStream out = null;
+  String type;
+  String format;
+  String mode = DEFAULT_DATA_MODE;
+  boolean commit;
+  boolean optimize;
+  boolean dryRun; // Avoids actual network traffic to Solr
+
+  String[] args;
+
+  boolean auto = true;
+  private int currentDepth;
+
+  static HashMap<String, String> mimeMap;
+  FileFilter fileFilter;
+  // Backlog for crawling
+  List<LinkedHashSet<URI>> backlog = new ArrayList<>();
+  Set<URI> visited = new HashSet<>();
+
+  static final Set<String> DATA_MODES = new HashSet<>();
+
+  PostTool.PageFetcher pageFetcher = new PostTool.PageFetcher();
+
+  static {
+    DATA_MODES.add(DATA_MODE_FILES);
+    DATA_MODES.add(DATA_MODE_ARGS);
+    DATA_MODES.add(DATA_MODE_STDIN);
+    DATA_MODES.add(DATA_MODE_WEB);
+
+    mimeMap = new HashMap<>();
+    mimeMap.put("xml", "application/xml");
+    mimeMap.put("csv", "text/csv");
+    mimeMap.put("json", "application/json");
+    mimeMap.put("jsonl", "application/jsonl");
+    mimeMap.put("pdf", "application/pdf");
+    mimeMap.put("rtf", "text/rtf");
+    mimeMap.put("html", "text/html");
+    mimeMap.put("htm", "text/html");
+    mimeMap.put("doc", "application/msword");
+    mimeMap.put("docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document");
+    mimeMap.put("ppt", "application/vnd.ms-powerpoint");
+    mimeMap.put(
+        "pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation");
+    mimeMap.put("xls", "application/vnd.ms-excel");
+    mimeMap.put("xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
+    mimeMap.put("odt", "application/vnd.oasis.opendocument.text");
+    mimeMap.put("ott", "application/vnd.oasis.opendocument.text");
+    mimeMap.put("odp", "application/vnd.oasis.opendocument.presentation");
+    mimeMap.put("otp", "application/vnd.oasis.opendocument.presentation");
+    mimeMap.put("ods", "application/vnd.oasis.opendocument.spreadsheet");
+    mimeMap.put("ots", "application/vnd.oasis.opendocument.spreadsheet");
+    mimeMap.put("txt", "text/plain");
+    mimeMap.put("log", "text/plain");
+  }
+
   public PostTool() {
     this(CLIO.getOutStream());
   }
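The mimeMap initialized above backs the suffix-to-MIME lookup that guessType (added later in this patch) wraps with an application/octet-stream fallback. A small sketch, assuming the caller sits in the same package since guessType is protected:

    import java.io.File;

    public class GuessTypeSketch {
      public static void main(String[] args) {
        System.out.println(PostTool.guessType(new File("report.pdf")));  // application/pdf
        System.out.println(PostTool.guessType(new File("data.jsonl")));  // application/jsonl
        System.out.println(PostTool.guessType(new File("unknown.bin"))); // application/octet-stream
      }
    }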
@@ -43,9 +172,10 @@ public class PostTool extends ToolBase {
     return List.of(
         Option.builder("url")
             .argName("url")
+            .longOpt("solr-update-url")
             .hasArg()
             .required(false)
-            .desc("<base Solr update URL>")
+            .desc("Solr Update URL, the full url to the update handler, including the /update.")
             .build(),
         Option.builder("c")
             .longOpt("name")
@@ -66,7 +196,8 @@ public class PostTool extends ToolBase {
             .argName("mode")
             .hasArg(true)
             .required(false)
-            .desc("Files crawls files, web crawls website. default: files.")
+            .desc(
+                "Files crawls files, web crawls website, args processes input args, and stdin reads a command from standard in. default: files.")
             .build(),
         Option.builder("recursive")
             .argName("recursive")
@@ -85,13 +216,13 @@ public class PostTool extends ToolBase {
             .argName("content-type")
             .hasArg(true)
             .required(false)
-            .desc("default: application/json")
+            .desc("Specify a specific mimetype to use, such as application/json.")
             .build(),
         Option.builder("filetypes")
             .argName("<type>[,<type>,...]")
             .hasArg(true)
             .required(false)
-            .desc("default: " + SimplePostTool.DEFAULT_FILE_TYPES)
+            .desc("default: " + DEFAULT_FILE_TYPES)
             .build(),
         Option.builder("params")
             .argName("<key>=<value>[&<key>=<value>...]")
@@ -107,6 +238,12 @@ public class PostTool extends ToolBase {
             .required(false)
             .desc(
                 "sends application/json content as Solr commands to /update instead of /update/json/docs.")
+            .build(),
+        Option.builder()
+            .longOpt("dry-run")
+            .required(false)
+            .desc(
+                "Performs a dry run of the posting process without actually sending documents to Solr.  Only works with files mode.")
             .build());
   }
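With the longOpt added above, commons-cli accepts either -url or --solr-update-url and exposes both through the short name, which is how runImpl queries it below. A stand-alone sketch of that resolution:

    import org.apache.commons.cli.CommandLine;
    import org.apache.commons.cli.DefaultParser;
    import org.apache.commons.cli.Option;
    import org.apache.commons.cli.Options;

    public class LongOptSketch {
      public static void main(String[] args) throws Exception {
        Options options = new Options();
        options.addOption(Option.builder("url").longOpt("solr-update-url").hasArg().build());
        String[] argv = {"--solr-update-url", "http://localhost:8983/solr/films/update"};
        CommandLine cli = new DefaultParser().parse(options, argv);
        System.out.println(cli.getOptionValue("url")); // resolved via the short name
      }
    }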
 
@@ -114,52 +251,1027 @@ public class PostTool extends ToolBase {
   public void runImpl(CommandLine cli) throws Exception {
     SolrCLI.raiseLogLevelUnlessVerbose(cli);
 
-    URL solrUrl = null;
+    solrUpdateUrl = null;
     if (cli.hasOption("url")) {
       String url = cli.getOptionValue("url");
-      solrUrl = new URL(url);
+      solrUpdateUrl = new URL(url);
     } else if (cli.hasOption("c")) {
       String url = SolrCLI.getDefaultSolrUrl() + "/solr/" + cli.getOptionValue("c") + "/update";
-      solrUrl = new URL(url);
+      solrUpdateUrl = new URL(url);
     } else {
       throw new IllegalArgumentException(
           "Must specify either -url or -c parameter to post documents.");
     }
 
-    String mode = SimplePostTool.DEFAULT_DATA_MODE;
     if (cli.hasOption("mode")) {
       mode = cli.getOptionValue("mode");
     }
-    boolean auto = true;
-    String type = null;
+
+    if (cli.hasOption("dry-run")) {
+      dryRun = true;
+    }
+
     if (cli.hasOption("type")) {
       type = cli.getOptionValue("type");
+      // Turn off automatically looking up the mimetype in favour of what is passed in.
+      auto = false;
     }
-    String format =
-        cli.hasOption("format")
-            ? SimplePostTool.FORMAT_SOLR
-            : ""; // i.e not solr formatted json commands
+    format = cli.hasOption("format") ? FORMAT_SOLR : ""; // i.e. not Solr-formatted JSON commands
 
-    String fileTypes = SimplePostTool.DEFAULT_FILE_TYPES;
     if (cli.hasOption("filetypes")) {
       fileTypes = cli.getOptionValue("filetypes");
     }
 
-    int defaultDelay = (mode.equals((SimplePostTool.DATA_MODE_WEB)) ? 10 : 0);
-    int delay = Integer.parseInt(cli.getOptionValue("delay", String.valueOf(defaultDelay)));
-    int recursive = Integer.parseInt(cli.getOptionValue("recursive", "1"));
+    int defaultDelay = (mode.equals((DATA_MODE_WEB)) ? 10 : 0);
+    delay = Integer.parseInt(cli.getOptionValue("delay", String.valueOf(defaultDelay)));
+    recursive = Integer.parseInt(cli.getOptionValue("recursive", "1"));
 
-    OutputStream out = cli.hasOption("out") ? CLIO.getOutStream() : null;
-    boolean commit = cli.hasOption("skipcommit") ? false : true;
-    boolean optimize = cli.hasOption("optimize");
+    out = cli.hasOption("out") ? CLIO.getOutStream() : null;
+    commit = !cli.hasOption("skipcommit");
+    optimize = cli.hasOption("optimize");
 
-    String[] args = cli.getArgs();
+    args = cli.getArgs();
 
-    SimplePostTool spt =
-        new SimplePostTool(
-            mode, solrUrl, auto, type, format, recursive, delay, fileTypes, out, commit, optimize,
-            args);
+    execute();
+  }
+
+  /**
+   * After initialization, call execute to start the post job. This method delegates to the correct
+   * mode method.
+   */
+  public void execute() throws SolrServerException, IOException {
+    final RTimer timer = new RTimer();
+    if (PostTool.DATA_MODE_FILES.equals(mode)) {
+      doFilesMode();
+    } else if (DATA_MODE_ARGS.equals(mode)) {
+      doArgsMode(args);
+    } else if (PostTool.DATA_MODE_WEB.equals(mode)) {
+      doWebMode();
+    } else if (DATA_MODE_STDIN.equals(mode)) {
+      doStdinMode();
+    } else {
+      return;
+    }
+
+    if (commit) {
+      commit();
+    }
+    if (optimize) {
+      optimize();
+    }
+    displayTiming((long) timer.getTime());
+  }
+
+  private void doFilesMode() {
+    currentDepth = 0;
+
+    info(
+        "Posting files to [base] url "
+            + solrUpdateUrl
+            + (!auto ? " using content-type " + (type == null ? DEFAULT_CONTENT_TYPE : type) : "")
+            + "...");
+    if (auto) {
+      info("Entering auto mode. File endings considered are " + fileTypes);
+    }
+    if (recursive > 0) {
+      info("Entering recursive mode, max depth=" + recursive + ", delay=" + delay + "s");
+    }
+    fileFilter = getFileFilterFromFileTypes(fileTypes);
+    int numFilesPosted = postFiles(args, 0, out, type);
+    if (dryRun) {
+      info("Dry run complete. " + numFilesPosted + " would have been indexed.");
+    } else {
+      info(numFilesPosted + " files indexed.");
+    }
+  }
+
+  private void doArgsMode(String[] args) {
+    info("POSTing args to " + solrUpdateUrl + "...");
+    for (String a : args) {
+      postData(stringToStream(a), null, out, type, solrUpdateUrl);
+    }
+  }
+
+  private void doWebMode() {
+    reset();
+    int numPagesPosted = 0;
+    try {
+      if (type != null) {
+        throw new IllegalArgumentException(
+            "Specifying content-type with \"-Ddata=web\" is not supported");
+      }
+
+      // Set Extracting handler as default
+      solrUpdateUrl = appendUrlPath(solrUpdateUrl, "/extract");
+
+      info("Posting web pages to Solr url " + solrUpdateUrl);
+      auto = true;
+      info(
+          "Entering auto mode. Indexing pages with content-types corresponding to file endings "
+              + fileTypes);
+      if (recursive > 0) {
+        if (recursive > MAX_WEB_DEPTH) {
+          recursive = MAX_WEB_DEPTH;
+          warn("Too large recursion depth for web mode, limiting to " + MAX_WEB_DEPTH + "...");
+        }
+        if (delay < DEFAULT_WEB_DELAY) {
+          warn(
+              "Never crawl an external web site faster than every 10 seconds, your IP will probably be blocked");
+        }
+        info("Entering recursive mode, depth=" + recursive + ", delay=" + delay + "s");
+      }
+      numPagesPosted = postWebPages(args, 0, out);
+      info(numPagesPosted + " web pages indexed.");
+
+    } catch (MalformedURLException e) {
+      warn("Wrong URL trying to append /extract to " + solrUpdateUrl);
+    }
+  }
+
+  private void doStdinMode() {
+    info("POSTing stdin to " + solrUpdateUrl + "...");
+    postData(System.in, null, out, type, solrUpdateUrl);
+  }
+
+  private void reset() {
+    backlog = new ArrayList<>();
+    visited = new HashSet<>();
+  }
+
+  /**
+   * Pretty prints the number of milliseconds taken to post the content to Solr
+   *
+   * @param millis the time in milliseconds
+   */
+  private void displayTiming(long millis) {
+    SimpleDateFormat df = new SimpleDateFormat("H:mm:ss.SSS", Locale.getDefault());
+    df.setTimeZone(TimeZone.getTimeZone("UTC"));
+    CLIO.out("Time spent: " + df.format(new Date(millis)));
+  }
+
+  private boolean checkIsValidPath(File srcFile) {
+    return Files.exists(srcFile.toPath());
+  }
+
+  /**
+   * Post all filenames provided in args
+   *
+   * @param args array of file names
+   * @param startIndexInArgs offset to start
+   * @param out output stream to post data to
+   * @param type default content-type to use when posting (may be overridden in auto mode)
+   * @return number of files posted
+   */
+  public int postFiles(String[] args, int startIndexInArgs, OutputStream out, String type) {
+    reset();
+    int filesPosted = 0;
+    for (int j = startIndexInArgs; j < args.length; j++) {
+      File srcFile = new File(args[j]);
+      filesPosted += getFilesPosted(out, type, srcFile);
+    }
+    return filesPosted;
+  }
+
+  private int getFilesPosted(final OutputStream out, final String type, final File srcFile) {
+    int filesPosted = 0;
+    boolean isValidPath = checkIsValidPath(srcFile);
+    if (isValidPath && srcFile.isDirectory() && srcFile.canRead()) {
+      filesPosted += postDirectory(srcFile, out, type);
+    } else if (isValidPath && srcFile.isFile() && srcFile.canRead()) {
+      filesPosted += postFiles(new File[] {srcFile}, out, type);
+    } else {
+      filesPosted += handleGlob(srcFile, out, type);
+    }
+    return filesPosted;
+  }
+
+  /**
+   * Posts a whole directory
+   *
+   * @return number of files posted total
+   */
+  private int postDirectory(File dir, OutputStream out, String type) {
+    if (dir.isHidden() && !dir.getName().equals(".")) {
+      return 0;
+    }
+    info(
+        "Indexing directory "
+            + dir.getPath()
+            + " ("
+            + dir.listFiles(fileFilter).length
+            + " files, depth="
+            + currentDepth
+            + ")");
+    int posted = 0;
+    posted += postFiles(dir.listFiles(fileFilter), out, type);
+    if (recursive > currentDepth) {
+      for (File d : dir.listFiles()) {
+        if (d.isDirectory()) {
+          currentDepth++;
+          posted += postDirectory(d, out, type);
+          currentDepth--;
+        }
+      }
+    }
+    return posted;
+  }
+
+  /**
+   * Posts a list of file names
+   *
+   * @return number of files posted
+   */
+  int postFiles(File[] files, OutputStream out, String type) {
+    int filesPosted = 0;
+    for (File srcFile : files) {
+      try {
+        if (!srcFile.isFile() || srcFile.isHidden()) {
+          continue;
+        }
+        postFile(srcFile, out, type);
+        Thread.sleep(delay * 1000L);
+        filesPosted++;
+      } catch (InterruptedException | MalformedURLException e) {
+        throw new RuntimeException(e);
+      }
+    }
+    return filesPosted;
+  }
+
+  /**
+   * This only handles file globs not full path globbing.
+   *
+   * @param globFile file holding glob path
+   * @param out outputStream to write results to
+   * @param type default content-type to use when posting (may be overridden in auto mode)
+   * @return number of files posted
+   */
+  int handleGlob(File globFile, OutputStream out, String type) {
+    int filesPosted = 0;
+    File parent = globFile.getParentFile();
+    if (parent == null) {
+      parent = new File(".");
+    }
+    String fileGlob = globFile.getName();
+    PostTool.GlobFileFilter ff = new PostTool.GlobFileFilter(fileGlob, false);
+    File[] fileList = parent.listFiles(ff);
+    if (fileList == null || fileList.length == 0) {
+      warn("No files or directories matching " + globFile);
+    } else {
+      filesPosted = postFiles(fileList, out, type);
+    }
+    return filesPosted;
+  }
+
+  /**
+   * This method takes as input a list of start URL strings for crawling, converts the URL strings
+   * to URIs, adds each one to the backlog, and then starts crawling
+   *
+   * @param args the raw input args from main()
+   * @param startIndexInArgs offset for where to start
+   * @param out outputStream to write results to
+   * @return the number of web pages posted
+   */
+  public int postWebPages(String[] args, int startIndexInArgs, OutputStream out) {
+    reset();
+    LinkedHashSet<URI> s = new LinkedHashSet<>();
+    for (int j = startIndexInArgs; j < args.length; j++) {
+      try {
+        URI uri = new URI(normalizeUrlEnding(args[j]));
+        s.add(uri);
+      } catch (URISyntaxException e) {
+        warn("Skipping malformed input URL: " + args[j]);
+      }
+    }
+    // Add URIs to level 0 of the backlog and start recursive crawling
+    backlog.add(s);
+    return webCrawl(0, out);
+  }
+
+  /**
+   * Normalizes a URL string by removing anchor part and trailing slash
+   *
+   * @return the normalized URL string
+   */
+  protected static String normalizeUrlEnding(String link) {
+    if (link.contains("#")) {
+      link = link.substring(0, link.indexOf('#'));
+    }
+    if (link.endsWith("?")) {
+      link = link.substring(0, link.length() - 1);
+    }
+    if (link.endsWith("/")) {
+      link = link.substring(0, link.length() - 1);
+    }
+    return link;
+  }
+
+  /**
+   * A very simple crawler, pulling URLs to fetch from a backlog and then recurses N levels deep if
+   * recursive&gt;0. Links are parsed from HTML through first getting an XHTML version using
+   * SolrCell with extractOnly, and followed if they are local. The crawler pauses for a default
+   * delay of 10 seconds between each fetch, this can be configured in the delay variable. This is
+   * only meant for test purposes, as it does not respect robots or anything else fancy :)
+   *
+   * @param level which level to crawl
+   * @param out output stream to write to
+   * @return number of pages crawled on this level and below
+   */
+  protected int webCrawl(int level, OutputStream out) {
+    int numPages = 0;
+    LinkedHashSet<URI> stack = backlog.get(level);
+    int rawStackSize = stack.size();
+    stack.removeAll(visited);
+    int stackSize = stack.size();
+    LinkedHashSet<URI> subStack = new LinkedHashSet<>();
+    info(
+        "Entering crawl at level "
+            + level
+            + " ("
+            + rawStackSize
+            + " links total, "
+            + stackSize
+            + " new)");
+    for (URI uri : stack) {
+      try {
+        visited.add(uri);
+        URL url = uri.toURL();
+        PostTool.PageFetcherResult result = pageFetcher.readPageFromUrl(url);
+        if (result.httpStatus == 200) {
+          url = (result.redirectUrl != null) ? result.redirectUrl : url;
+          URL postUrl =
+              new URL(
+                  appendParam(
+                      solrUpdateUrl.toString(),
+                      "literal.id="
+                          + URLEncoder.encode(url.toString(), UTF_8)
+                          + "&literal.url="
+                          + URLEncoder.encode(url.toString(), UTF_8)));
+          ByteBuffer content = result.content;
+          boolean success =
+              postData(
+                  new ByteArrayInputStream(content.array(), content.arrayOffset(), content.limit()),
+                  null,
+                  out,
+                  result.contentType,
+                  postUrl);
+          if (success) {
+            info("POSTed web resource " + url + " (depth: " + level + ")");
+            Thread.sleep(delay * 1000L);
+            numPages++;
+            // Pull links from HTML pages only
+            if (recursive > level && result.contentType.equals("text/html")) {
+              Set<URI> children =
+                  pageFetcher.getLinksFromWebPage(
+                      url,
+                      new ByteArrayInputStream(
+                          content.array(), content.arrayOffset(), content.limit()),
+                      result.contentType,
+                      postUrl);
+              subStack.addAll(children);
+            }
+          } else {
+            warn("An error occurred while posting " + uri);
+          }
+        } else {
+          warn("The URL " + uri + " returned a HTTP result status of " + result.httpStatus);
+        }
+      } catch (IOException | URISyntaxException e) {
+        warn("Caught exception when trying to open connection to " + uri + ": " + e.getMessage());
+      } catch (InterruptedException e) {
+        throw new RuntimeException(e);
+      }
+    }
+    if (!subStack.isEmpty()) {
+      backlog.add(subStack);
+      numPages += webCrawl(level + 1, out);
+    }
+    return numPages;
+  }
+
+  /**
+   * Computes the full URL based on a base url and a possibly relative link found in the href param
+   * of an HTML anchor.
+   *
+   * @param baseUrl the base url from where the link was found
+   * @param link the absolute or relative link
+   * @return the string version of the full URL
+   */
+  protected String computeFullUrl(URL baseUrl, String link) {
+    if (link == null || link.length() == 0) {
+      return null;
+    }
+    if (!link.startsWith("http")) {
+      if (link.startsWith("/")) {
+        link = baseUrl.getProtocol() + "://" + baseUrl.getAuthority() + link;
+      } else {
+        if (link.contains(":")) {
+          return null; // Skip non-relative URLs
+        }
+        String path = baseUrl.getPath();
+        if (!path.endsWith("/")) {
+          int sep = path.lastIndexOf('/');
+          String file = path.substring(sep + 1);
+          if (file.contains(".") || file.contains("?")) {
+            path = path.substring(0, sep);
+          }
+        }
+        link = baseUrl.getProtocol() + "://" + baseUrl.getAuthority() + path + "/" + link;
+      }
+    }
+    link = normalizeUrlEnding(link);
+    String l = link.toLowerCase(Locale.ROOT);
+    // Simple brute force skip images
+    if (l.endsWith(".jpg") || l.endsWith(".jpeg") || l.endsWith(".png") || l.endsWith(".gif")) {
+      return null; // Skip images
+    }
+    return link;
+  }
+
+  /**
+   * Uses the mime-type map to reverse-look-up whether the file ending for the given content type
+   * is enabled by the fileTypes option
+   *
+   * @param type what content-type to lookup
+   * @return true if this is a supported content type
+   */
+  protected boolean typeSupported(String type) {
+    for (Map.Entry<String, String> entry : mimeMap.entrySet()) {
+      if (entry.getValue().equals(type)) {
+        if (fileTypes.contains(entry.getKey())) {
+          return true;
+        }
+      }
+    }
+    return false;
+  }
+
+  static void warn(String msg) {
+    CLIO.err("PostTool: WARNING: " + msg);
+  }
+
+  static void info(String msg) {
+    CLIO.out(msg);
+  }
+
+  /** Does a simple commit operation */
+  public void commit() throws IOException, SolrServerException {
+    info("COMMITting Solr index changes to " + solrUpdateUrl + "...");
+    String url = solrUpdateUrl.toString();
+    url = url.substring(0, url.lastIndexOf("/update"));
+    try (final SolrClient client = SolrCLI.getSolrClient(url, credentials)) {
+      client.commit();
+    }
+  }
+
+  /** Does a simple optimize operation */
+  public void optimize() throws IOException, SolrServerException {
+    info("Performing an OPTIMIZE to " + solrUpdateUrl + "...");
+    String url = solrUpdateUrl.toString();
+    url = url.substring(0, url.lastIndexOf("/update"));
+    try (final SolrClient client = SolrCLI.getSolrClient(url, credentials)) {
+      client.optimize();
+    }
+  }
+
+  /**
+   * Appends a URL query parameter to a URL
+   *
+   * @param url the original URL
+   * @param param the parameter(s) to append, separated by "&amp;"
+   * @return the string version of the resulting URL
+   */
+  public static String appendParam(String url, String param) {
+    String[] pa = param.split("&");
+    for (String p : pa) {
+      if (p.trim().length() == 0) {
+        continue;
+      }
+      String[] kv = p.split("=");
+      if (kv.length == 2) {
+        url = url + (url.contains("?") ? "&" : "?") + kv[0] + "=" + kv[1];
+      } else {
+        warn("Skipping param " + p + " which is not on form key=value");
+      }
+    }
+    return url;
+  }
+
+  /** Opens the file and posts its contents to the solrUrl, writing the response to output. */
+  public void postFile(File file, OutputStream output, String type) throws MalformedURLException {
+    InputStream is = null;
+
+    URL url = solrUpdateUrl;
+    String suffix = "";
+    if (auto) {
+      if (type == null) {
+        type = guessType(file);
+      }
+      // TODO: Add a flag that disables /update and sends all to /update/extract, to avoid CSV,
+      // JSON, and XML files
+      // TODO: from being interpreted as Solr documents internally
+      if (type.equals("application/json") && !PostTool.FORMAT_SOLR.equals(format)) {
+        suffix = "/json/docs";
+        String urlStr = appendUrlPath(solrUpdateUrl, suffix).toString();
+        url = new URL(urlStr);
+      } else if (type.equals("application/xml")
+          || type.equals("text/csv")
+          || type.equals("application/json")) {
+        // Default handler
+      } else {
+        // SolrCell
+        suffix = "/extract";
+        String urlStr = appendUrlPath(solrUpdateUrl, suffix).toString();
+        if (!urlStr.contains("resource.name")) {
+          urlStr =
+              appendParam(
+                  urlStr, "resource.name=" + URLEncoder.encode(file.getAbsolutePath(), UTF_8));
+        }
+        if (!urlStr.contains("literal.id")) {
+          urlStr =
+              appendParam(urlStr, "literal.id=" + URLEncoder.encode(file.getAbsolutePath(), UTF_8));
+        }
+        url = new URL(urlStr);
+      }
+    } else {
+      if (type == null) {
+        type = DEFAULT_CONTENT_TYPE;
+      }
+    }
+    if (dryRun) {
+      info(
+          "DRY RUN of POSTing file "
+              + file.getName()
+              + (auto ? " (" + type + ")" : "")
+              + " to [base]"
+              + suffix);
+    } else {
+      try {
+        info(
+            "POSTing file "
+                + file.getName()
+                + (auto ? " (" + type + ")" : "")
+                + " to [base]"
+                + suffix);
+        is = new FileInputStream(file);
+        postData(is, file.length(), output, type, url);
+      } catch (IOException e) {
+        warn("Can't open/read file: " + file);
+      } finally {
+        try {
+          if (is != null) {
+            is.close();
+          }
+        } catch (IOException e) {
+          warn("IOException while closing file: " + e);
+        }
+      }
+    }
+  }
+
+  /**
+   * Appends to the path of the URL
+   *
+   * @param url the URL
+   * @param append the path to append
+   * @return the final URL version
+   */
+  protected static URL appendUrlPath(URL url, String append) throws MalformedURLException {
+    return new URL(
+        url.getProtocol()
+            + "://"
+            + url.getAuthority()
+            + url.getPath()
+            + append
+            + (url.getQuery() != null ? "?" + url.getQuery() : ""));
+  }
+
+  /**
+   * Guesses the type of a file based on its file name suffix. Returns "application/octet-stream"
+   * if there is no corresponding mimeMap type.
+   *
+   * @param file the file
+   * @return the content-type guessed
+   */
+  protected static String guessType(File file) {
+    String name = file.getName();
+    String suffix = name.substring(name.lastIndexOf('.') + 1);
+    String type = mimeMap.get(suffix.toLowerCase(Locale.ROOT));
+    return (type != null) ? type : "application/octet-stream";
+  }
+
+  /**
+   * Reads data from the data stream and posts it to Solr, writing the response to output
+   *
+   * @return true if success
+   */
+  public boolean postData(
+      InputStream data, Long length, OutputStream output, String type, URL url) {
+    if (dryRun) {
+      return true;
+    }
+
+    boolean success = true;
+    if (type == null) {
+      type = DEFAULT_CONTENT_TYPE;
+    }
+    HttpURLConnection urlConnection = null;
+    try {
+      try {
+        urlConnection = (HttpURLConnection) url.openConnection();
+        try {
+          urlConnection.setRequestMethod("POST");
+        } catch (ProtocolException e) {
+          warn("Shouldn't happen: HttpURLConnection doesn't support POST??" + e);
+        }
+        urlConnection.setDoOutput(true);
+        urlConnection.setDoInput(true);
+        urlConnection.setUseCaches(false);
+        urlConnection.setAllowUserInteraction(false);
+        urlConnection.setRequestProperty("Content-type", type);
+        basicAuth(urlConnection);
+        if (null != length) {
+          urlConnection.setFixedLengthStreamingMode(length);
+        } else {
+          urlConnection.setChunkedStreamingMode(-1); // use JDK default chunkLen, 4k in Java 8.
+        }
+        urlConnection.connect();
+      } catch (IOException e) {
+        warn("Connection error (is Solr running at " + solrUpdateUrl + " ?): " + e);
+        success = false;
+      } catch (Exception e) {
+        warn("POST failed with error " + e.getMessage());
+      }
+
+      try (final OutputStream out = urlConnection.getOutputStream()) {
+        pipe(data, out);
+      } catch (IOException e) {
+        warn("IOException while posting data: " + e);
+      }
+
+      try {
+        success &= checkResponseCode(urlConnection);
+        try (final InputStream in = urlConnection.getInputStream()) {
+          pipe(in, output);
+        }
+      } catch (IOException e) {
+        warn("IOException while reading response: " + e);
+        success = false;
+      } catch (GeneralSecurityException e) {
+        warn(
+            "Looks like Solr is secured and would not let us in. Try with another user in '-u' parameter");
+      }
+    } finally {
+      if (urlConnection != null) {
+        urlConnection.disconnect();
+      }
+    }
+    return success;
+  }
+
+  private void basicAuth(HttpURLConnection urlc) throws Exception {
+    if (urlc.getURL().getUserInfo() != null) {
+      String encoding =
+          Base64.getEncoder().encodeToString(urlc.getURL().getUserInfo().getBytes(US_ASCII));
+      urlc.setRequestProperty("Authorization", "Basic " + encoding);
+    } else if (credentials != null) {
+      if (!credentials.contains(":")) {
+        throw new Exception("credentials '" + credentials + "' must be of format user:pass");
+      }
+      urlc.setRequestProperty(
+          "Authorization",
+          "Basic " + Base64.getEncoder().encodeToString(credentials.getBytes(UTF_8)));
+    }
+  }
+
+  private static boolean checkResponseCode(HttpURLConnection urlc)
+      throws IOException, GeneralSecurityException {
+    if (urlc.getResponseCode() >= 400) {
+      warn(
+          "Solr returned an error #"
+              + urlc.getResponseCode()
+              + " ("
+              + urlc.getResponseMessage()
+              + ") for url: "
+              + urlc.getURL());
+      Charset charset = StandardCharsets.ISO_8859_1;
+      final String contentType = urlc.getContentType();
+      // code cloned from ContentStreamBase, but post.jar should be standalone!
+      if (contentType != null) {
+        int idx = contentType.toLowerCase(Locale.ROOT).indexOf("charset=");
+        if (idx > 0) {
+          charset = Charset.forName(contentType.substring(idx + "charset=".length()).trim());
+        }
+      }
+      // Print the response returned by Solr
+      try (InputStream errStream = urlc.getErrorStream()) {
+        if (errStream != null) {
+          BufferedReader br = new BufferedReader(new InputStreamReader(errStream, charset));
+          final StringBuilder response = new StringBuilder("Response: ");
+          int ch;
+          while ((ch = br.read()) != -1) {
+            response.append((char) ch);
+          }
+          warn(response.toString().trim());
+        }
+      }
+      if (urlc.getResponseCode() == 401) {
+        throw new GeneralSecurityException(
+            "Solr requires authentication (response 401). Please try again with '-u' option");
+      }
+      if (urlc.getResponseCode() == 403) {
+        throw new GeneralSecurityException(
+            "You are not authorized to perform this action against Solr. (response 403)");
+      }
+      return false;
+    }
+    return true;
+  }
+
+  /**
+   * Converts a string to an input stream
+   *
+   * @param s the string
+   * @return the input stream
+   */
+  public static InputStream stringToStream(String s) {
+    return new ByteArrayInputStream(s.getBytes(StandardCharsets.UTF_8));
+  }
+
+  /**
+   * Pipes everything from the source to the dest. If dest is null, then everything is read from
+   * source and thrown away.
+   */
+  private static void pipe(InputStream source, OutputStream dest) throws IOException {
+    byte[] buf = new byte[1024];
+    int read = 0;
+    while ((read = source.read(buf)) >= 0) {
+      if (null != dest) {
+        dest.write(buf, 0, read);
+      }
+    }
+    if (null != dest) {
+      dest.flush();
+    }
+  }
+
+  public FileFilter getFileFilterFromFileTypes(String fileTypes) {
+    String glob;
+    if (fileTypes.equals("*")) {
+      glob = ".*";
+    } else {
+      glob = "^.*\\.(" + fileTypes.replace(",", "|") + ")$";
+    }
+    return new PostTool.GlobFileFilter(glob, true);
+  }
+
+  //
+  // Utility methods for XPath handing
+  //
+
+  /** Gets all nodes matching an XPath */
+  public static NodeList getNodesFromXP(Node n, String xpath) throws XPathExpressionException {
+    XPathFactory factory = XPathFactory.newInstance();
+    XPath xp = factory.newXPath();
+    XPathExpression expr = xp.compile(xpath);
+    return (NodeList) expr.evaluate(n, XPathConstants.NODESET);
+  }
+
+  /**
+   * Gets the string content of the nodes matching an XPath
+   *
+   * @param n the node (or doc)
+   * @param xpath the xpath string
+   * @param concatAll if true, text from all matching nodes will be concatenated, else only the
+   *     first is returned
+   */
+  public static String getXP(Node n, String xpath, boolean concatAll)
+      throws XPathExpressionException {
+    NodeList nodes = getNodesFromXP(n, xpath);
+    StringBuilder sb = new StringBuilder();
+    if (nodes.getLength() > 0) {
+      for (int i = 0; i < nodes.getLength(); i++) {
+        sb.append(nodes.item(i).getNodeValue()).append(' ');
+        if (!concatAll) {
+          break;
+        }
+      }
+      return sb.toString().trim();
+    } else return "";
+  }
+
+  /** Takes a byte array as input and returns a DOM document */
+  public static Document makeDom(byte[] in)
+      throws SAXException, IOException, ParserConfigurationException {
+    InputStream is = new ByteArrayInputStream(in);
+    Document dom = DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(is);
+    return dom;
+  }
+
+  /** Inner class to filter files based on glob wildcards */
+  static class GlobFileFilter implements FileFilter {
+    private final Pattern p;
+
+    public GlobFileFilter(String pattern, boolean isRegex) {
+      String _pattern = pattern;
+      if (!isRegex) {
+        _pattern =
+            _pattern
+                .replace("^", "\\^")
+                .replace("$", "\\$")
+                .replace(".", "\\.")
+                .replace("(", "\\(")
+                .replace(")", "\\)")
+                .replace("+", "\\+")
+                .replace("*", ".*")
+                .replace("?", ".");
+        _pattern = "^" + _pattern + "$";
+      }
+
+      try {
+        p = Pattern.compile(_pattern, Pattern.CASE_INSENSITIVE);
+      } catch (PatternSyntaxException e) {
+        throw new IllegalArgumentException(
+            "Invalid type list " + pattern + ". " + e.getDescription());
+      }
+    }
+
+    @Override
+    public boolean accept(File file) {
+      return p.matcher(file.getName()).find();
+    }
+  }
+
+  //
+  // Simple crawler class which can fetch a page and check for robots.txt
+  //
+  class PageFetcher {
+    Map<String, List<String>> robotsCache;
+    static final String DISALLOW = "Disallow:";
+
+    public PageFetcher() {
+      robotsCache = new HashMap<>();
+    }
+
+    public PageFetcherResult readPageFromUrl(URL u) throws URISyntaxException {
+      PostTool.PageFetcherResult res = new PostTool.PageFetcherResult();
+      try {
+        if (isDisallowedByRobots(u)) {
+          warn("The URL " + u + " is disallowed by robots.txt and will not be crawled.");
+          res.httpStatus = 403;
+          URI uri = u.toURI();
+          visited.add(uri);
+          return res;
+        }
+        res.httpStatus = 404;
+        HttpURLConnection conn = (HttpURLConnection) u.openConnection();
+        conn.setRequestProperty(
+            "User-Agent",
+            "PostTool-crawler/" + SolrVersion.LATEST_STRING + " (https://solr.apache.org/)");
+        conn.setRequestProperty("Accept-Encoding", "gzip, deflate");
+        conn.connect();
+        res.httpStatus = conn.getResponseCode();
+        if (!normalizeUrlEnding(conn.getURL().toString())
+            .equals(normalizeUrlEnding(u.toString()))) {
+          info("The URL " + u + " caused a redirect to " + conn.getURL());
+          u = conn.getURL();
+          res.redirectUrl = u;
+          URI uri = u.toURI();
+          visited.add(uri);
+        }
+        if (res.httpStatus == 200) {
+          // Raw content type of form "text/html; encoding=utf-8"
+          String rawContentType = conn.getContentType();
+          String type = rawContentType.split(";")[0];
+          if (typeSupported(type) || "*".equals(fileTypes)) {
+            String encoding = conn.getContentEncoding();
+            InputStream is;
+            if (encoding != null && encoding.equalsIgnoreCase("gzip")) {
+              is = new GZIPInputStream(conn.getInputStream());
+            } else if (encoding != null && encoding.equalsIgnoreCase("deflate")) {
+              is = new InflaterInputStream(conn.getInputStream(), new Inflater(true));
+            } else {
+              is = conn.getInputStream();
+            }
+
+            // Read into memory, so that we later can pull links from the page without re-fetching
+            res.content = Utils.toByteArray(is);
+            is.close();
+          } else {
+            warn("Skipping URL with unsupported type " + type);
+            res.httpStatus = 415;
+          }
+        }
+      } catch (IOException e) {
+        warn("IOException when reading page from url " + u + ": " + e.getMessage());
+      }
+      return res;
+    }
+
+    public boolean isDisallowedByRobots(URL url) {
+      String host = url.getHost();
+      String strRobot = url.getProtocol() + "://" + host + "/robots.txt";
+      List<String> disallows = robotsCache.get(host);
+      if (disallows == null) {
+        disallows = new ArrayList<>();
+        URL urlRobot;
+        try {
+          urlRobot = new URL(strRobot);
+          disallows = parseRobotsTxt(urlRobot.openStream());
+        } catch (MalformedURLException e) {
+          return true; // We cannot trust this robots URL, should not happen
+        } catch (IOException e) {
+          // There is no robots.txt, will cache an empty disallow list
+        }
+      }
+
+      robotsCache.put(host, disallows);
+
+      String strURL = url.getFile();
+      for (String path : disallows) {
+        if (path.equals("/") || strURL.indexOf(path) == 0) return true;
+      }
+      return false;
+    }
+
+    /**
+     * Very simple robots.txt parser which obeys all Disallow lines regardless of user agent or
+     * whether there are valid Allow: lines.
+     *
+     * @param is Input stream of the robots.txt file
+     * @return a list of disallow paths
+     * @throws IOException if problems reading the stream
+     */
+    protected List<String> parseRobotsTxt(InputStream is) throws IOException {
+      List<String> disallows = new ArrayList<>();
+      BufferedReader r = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8));
+      String l;
+      while ((l = r.readLine()) != null) {
+        String[] arr = l.split("#");
+        if (arr.length == 0) continue;
+        l = arr[0].trim();
+        if (l.startsWith(DISALLOW)) {
+          l = l.substring(DISALLOW.length()).trim();
+          if (l.length() == 0) continue;
+          disallows.add(l);
+        }
+      }
+      is.close();
+      return disallows;
+    }
+
+    /**
+     * Finds links on a web page, using /extract?extractOnly=true
+     *
+     * @param url the URL of the web page
+     * @param is the input stream of the page
+     * @param type the content-type
+     * @param postUrl the posting URL (typically ending in /extract) used to pull out links
+     * @return a set of URIs parsed from the page
+     */
+    protected Set<URI> getLinksFromWebPage(URL url, InputStream is, String type, URL postUrl) {
+      Set<URI> linksFromPage = new HashSet<>();
+
+      try {
+        ByteArrayOutputStream os = new ByteArrayOutputStream();
+        URL extractUrl = new URL(appendParam(postUrl.toString(), "extractOnly=true"));
+        extractUrl = new URL(appendParam(extractUrl.toString(), "wt=xml"));
+        boolean success = postData(is, null, os, type, extractUrl);
+        if (success) {
+          Document d = makeDom(os.toByteArray());
+          String innerXml = getXP(d, "/response/str/text()[1]", false);
+          d = makeDom(innerXml.getBytes(StandardCharsets.UTF_8));
+          NodeList links = getNodesFromXP(d, "/html/body//a/@href");
+          for (int i = 0; i < links.getLength(); i++) {
+            String link = links.item(i).getTextContent();
+            link = computeFullUrl(url, link);
+            if (link == null) {
+              continue;
+            }
+            URI newUri = new URI(link);
+            if (newUri.getAuthority() == null
+                || !newUri.getAuthority().equals(url.getAuthority())) {
+              linksFromPage.add(newUri);
+            }
+          }
+        }
+      } catch (MalformedURLException e) {
+        warn("Malformed URL " + url);
+      } catch (IOException e) {
+        warn("IOException opening URL " + url + ": " + e.getMessage());
+      } catch (Exception e) {
+        throw new RuntimeException(e);
+      }
+
+      return linksFromPage;
+    }
+  }
 
-    spt.execute();
+  /** Utility class to hold the result from a page fetch */
+  public static class PageFetcherResult {
+    int httpStatus = 200;
+    String contentType = "text/html";
+    URL redirectUrl = null;
+    ByteBuffer content;
   }
 }
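Among the utilities folded into PostTool above, GlobFileFilter turns a simple file glob into an anchored, case-insensitive regex, and getFileFilterFromFileTypes builds the same kind of filter from a comma-separated type list. A short sketch of the behavior, assuming the caller is in the same package since GlobFileFilter is package-private:

    import java.io.File;
    import java.io.FileFilter;

    public class GlobFilterSketch {
      public static void main(String[] args) {
        // isRegex=false: "*.xml" is escaped and anchored into "^.*\.xml$"
        FileFilter xmlOnly = new PostTool.GlobFileFilter("*.xml", false);
        System.out.println(xmlOnly.accept(new File("solr.xml")));   // true
        System.out.println(xmlOnly.accept(new File("README.txt"))); // false

        FileFilter byTypes = new PostTool().getFileFilterFromFileTypes("doc,xls,ppt");
        System.out.println(byTypes.accept(new File("report.DOC"))); // true: case-insensitive
      }
    }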
diff --git a/solr/core/src/java/org/apache/solr/cli/RunExampleTool.java b/solr/core/src/java/org/apache/solr/cli/RunExampleTool.java
index 908990db22d..680a879372e 100644
--- a/solr/core/src/java/org/apache/solr/cli/RunExampleTool.java
+++ b/solr/core/src/java/org/apache/solr/cli/RunExampleTool.java
@@ -315,22 +315,20 @@ public class RunExampleTool extends ToolBase {
         String updateUrl = String.format(Locale.ROOT, "%s/%s/update", solrUrl, collectionName);
         echo("Indexing tech product example docs from " + exampledocsDir.getAbsolutePath());
 
-        String currentPropVal = System.getProperty("url");
-        System.setProperty("url", updateUrl);
-        String currentTypeVal = System.getProperty("type");
-        // We assume that example docs are always in XML.
-        System.setProperty("type", "application/xml");
-        SimplePostTool.main(new String[] {exampledocsDir.getAbsolutePath() + "/*.xml"});
-        if (currentPropVal != null) {
-          System.setProperty("url", currentPropVal); // reset
-        } else {
-          System.clearProperty("url");
-        }
-        if (currentTypeVal != null) {
-          System.setProperty("type", currentTypeVal); // reset
-        } else {
-          System.clearProperty("type");
-        }
+        String[] args =
+            new String[] {
+              "post",
+              "-url",
+              updateUrl,
+              "-type",
+              "application/xml",
+              exampledocsDir.getAbsolutePath() + "/*.xml"
+            };
+        PostTool postTool = new PostTool();
+        CommandLine postToolCli =
+            SolrCLI.parseCmdLine(postTool.getName(), args, postTool.getOptions());
+        postTool.runTool(postToolCli);
+
       } else {
         echo(
             "exampledocs directory not found, skipping indexing step for the techproducts example");
diff --git a/solr/core/src/test/org/apache/solr/cli/PostToolTest.java b/solr/core/src/test/org/apache/solr/cli/PostToolTest.java
index 88639c7cefc..e11c11884f1 100644
--- a/solr/core/src/test/org/apache/solr/cli/PostToolTest.java
+++ b/solr/core/src/test/org/apache/solr/cli/PostToolTest.java
@@ -20,10 +20,22 @@ package org.apache.solr.cli;
 import static org.apache.solr.cli.SolrCLI.findTool;
 import static org.apache.solr.cli.SolrCLI.parseCmdLine;
 
+import java.io.ByteArrayInputStream;
 import java.io.File;
 import java.io.FileWriter;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.MalformedURLException;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.net.URL;
+import java.nio.ByteBuffer;
 import java.nio.charset.StandardCharsets;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
 import java.util.Map;
+import java.util.Set;
 import org.apache.commons.cli.CommandLine;
 import org.apache.solr.SolrTestCaseJ4;
 import org.apache.solr.client.solrj.request.CollectionAdminRequest;
@@ -33,6 +45,11 @@ import org.apache.solr.common.util.Utils;
 import org.junit.BeforeClass;
 import org.junit.Test;
 
+/**
+ * NOTE: do *not* use real hostnames, not even "example.com", in the webcrawler tests.
+ *
+ * <p>A MockPageFetcher is used to prevent real HTTP requests from being executed.
+ */
 @SolrTestCaseJ4.SuppressSSL
 public class PostToolTest extends SolrCloudTestCase {
 
@@ -58,7 +75,7 @@ public class PostToolTest extends SolrCloudTestCase {
 
     String[] args = {
       "post",
-      "-url",
+      "--solr-update-url",
       cluster.getJettySolrRunner(0).getBaseUrl() + "/" + collection + "/update",
       jsonDoc.getAbsolutePath()
     };
@@ -90,4 +107,205 @@ public class PostToolTest extends SolrCloudTestCase {
     CommandLine cli = parseCmdLine(tool.getName(), args, tool.getOptions());
     return tool.runTool(cli);
   }
+
+  @Test
+  public void testNormalizeUrlEnding() {
+    assertEquals("http://[ff01::114]", PostTool.normalizeUrlEnding("http://[ff01::114]/"));
+    assertEquals(
+        "http://[ff01::114]", PostTool.normalizeUrlEnding("http://[ff01::114]/#foo?bar=baz"));
+    assertEquals(
+        "http://[ff01::114]/index.html",
+        PostTool.normalizeUrlEnding("http://[ff01::114]/index.html#hello"));
+  }
+
+  @Test
+  public void testComputeFullUrl() throws IOException {
+
+    PostTool webPostTool = new PostTool();
+
+    assertEquals(
+        "http://[ff01::114]/index.html",
+        webPostTool.computeFullUrl(new URL("http://[ff01::114]/"), "/index.html"));
+    assertEquals(
+        "http://[ff01::114]/index.html",
+        webPostTool.computeFullUrl(new URL("http://[ff01::114]/foo/bar/"), "/index.html"));
+    assertEquals(
+        "http://[ff01::114]/fil.html",
+        webPostTool.computeFullUrl(new URL("http://[ff01::114]/foo.htm?baz#hello"), "fil.html"));
+    //    TODO: How to know what is the base if URL path ends with "foo"??
+    //    assertEquals("http://[ff01::114]/fil.html", t_web.computeFullUrl(new
+    // URL("http://[ff01::114]/foo?baz#hello"), "fil.html"));
+    assertNull(webPostTool.computeFullUrl(new URL("http://[ff01::114]/"), "fil.jpg"));
+    assertNull(webPostTool.computeFullUrl(new URL("http://[ff01::114]/"), "mailto:hello@foo.bar"));
+    assertNull(webPostTool.computeFullUrl(new URL("http://[ff01::114]/"), "ftp://server/file"));
+  }
+
+  @Test
+  public void testTypeSupported() {
+    PostTool postTool = new PostTool();
+
+    assertTrue(postTool.typeSupported("application/pdf"));
+    assertTrue(postTool.typeSupported("application/xml"));
+    assertFalse(postTool.typeSupported("text/foo"));
+
+    postTool.fileTypes = "doc,xls,ppt";
+    postTool.fileFilter = postTool.getFileFilterFromFileTypes(postTool.fileTypes);
+    assertFalse(postTool.typeSupported("application/pdf"));
+    assertTrue(postTool.typeSupported("application/msword"));
+  }
+
+  @Test
+  public void testAppendParam() {
+    assertEquals(
+        "http://[ff01::114]?foo=bar", PostTool.appendParam("http://[ff01::114]", "foo=bar"));
+    assertEquals(
+        "http://[ff01::114]/?a=b&foo=bar",
+        PostTool.appendParam("http://[ff01::114]/?a=b", "foo=bar"));
+  }
+
+  @Test
+  public void testAppendUrlPath() throws MalformedURLException {
+    assertEquals(
+        new URL("http://[ff01::114]/a?foo=bar"),
+        PostTool.appendUrlPath(new URL("http://[ff01::114]?foo=bar"), "/a"));
+  }
+
+  @Test
+  public void testGuessType() {
+    File f = new File("foo.doc");
+    assertEquals("application/msword", PostTool.guessType(f));
+    f = new File("foobar");
+    assertEquals("application/octet-stream", PostTool.guessType(f));
+    f = new File("foo.json");
+    assertEquals("application/json", PostTool.guessType(f));
+  }
+
+  @Test
+  public void testDoFilesMode() throws MalformedURLException {
+    PostTool postTool = new PostTool();
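+    // recursive = 0 limits the walk to the named directory; dryRun counts files instead of posting them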
+    postTool.recursive = 0;
+    postTool.dryRun = true;
+    postTool.solrUpdateUrl = new URL("http://localhost:8983/solr/fake/update");
+    File dir = getFile("exampledocs");
+    int num = postTool.postFiles(new String[] {dir.toString()}, 0, null, null);
+    assertEquals(2, num);
+  }
+
+  @Test
+  public void testDoWebMode() throws IOException, URISyntaxException {
+    PostTool postTool = new PostTool();
+    postTool.pageFetcher = new MockPageFetcher();
+    postTool.dryRun = true;
+    postTool.solrUpdateUrl = new URL("http://user:password@localhost:5150/solr/fake/update");
+
+    // Uses mock pageFetcher
+    postTool.delay = 0;
+    postTool.recursive = 5;
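+    // a depth-5 crawl reaches all five allowed pages; /disallowed is blocked by the simulated robots.txt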
+    int num = postTool.postWebPages(new String[] {"http://[ff01::114]/#removeme"}, 0, null);
+    assertEquals(5, num);
+
+    postTool.recursive = 1;
+    num = postTool.postWebPages(new String[] {"http://[ff01::114]/"}, 0, null);
+    assertEquals(3, num);
+
+    // Without respecting robots.txt
+    postTool.pageFetcher.robotsCache.put("[ff01::114]", Collections.emptyList());
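+    // an empty disallow list blocks nothing, so /disallowed is crawled as well (5 -> 6 pages)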
+    postTool.recursive = 5;
+    num = postTool.postWebPages(new String[] {"http://[ff01::114]/#removeme"}, 0, null);
+    assertEquals(6, num);
+  }
+
+  @Test
+  public void testRobotsExclusion() throws IOException, URISyntaxException {
+    PostTool postTool = new PostTool();
+    postTool.pageFetcher = new MockPageFetcher();
+    postTool.dryRun = true;
+
+    assertFalse(postTool.pageFetcher.isDisallowedByRobots(new URL("http://[ff01::114]/")));
+    assertTrue(postTool.pageFetcher.isDisallowedByRobots(new URL("http://[ff01::114]/disallowed")));
+    assertEquals(
+        "There should be two entries parsed from robots.txt",
+        2,
+        postTool.pageFetcher.robotsCache.get("[ff01::114]").size());
+  }
+
+  static class MockPageFetcher extends PostTool.PageFetcher {
+    HashMap<String, String> htmlMap = new HashMap<>();
+    HashMap<String, Set<URI>> linkMap = new HashMap<>();
+
+    public MockPageFetcher() throws IOException, URISyntaxException {
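+      // PageFetcher is an inner (non-static) class of PostTool, so this standalone
+      // subclass must bind an enclosing instance via Java's qualified super() call.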
+      (new PostTool()).super();
+      htmlMap.put(
+          "http://[ff01::114]",
+          "<html><body><a href=\"http://[ff01::114]/page1\">page1</a><a href=\"http://[ff01::114]/page2\">page2</a></body></html>");
+      htmlMap.put(
+          "http://[ff01::114]/index.html",
+          "<html><body><a href=\"http://[ff01::114]/page1\">page1</a><a href=\"http://[ff01::114]/page2\">page2</a></body></html>");
+      htmlMap.put(
+          "http://[ff01::114]/page1",
+          "<html><body><a href=\"http://[ff01::114]/page1/foo\"></body></html>");
+      htmlMap.put(
+          "http://[ff01::114]/page1/foo",
+          "<html><body><a href=\"http://[ff01::114]/page1/foo/bar\"></body></html>");
+      htmlMap.put(
+          "http://[ff01::114]/page1/foo/bar",
+          "<html><body><a href=\"http://[ff01::114]/page1\"></body></html>");
+      htmlMap.put(
+          "http://[ff01::114]/page2",
+          "<html><body><a href=\"http://[ff01::114]/\"><a href=\"http://[ff01::114]/disallowed\"/></body></html>");
+      htmlMap.put(
+          "http://[ff01::114]/disallowed",
+          "<html><body><a href=\"http://[ff01::114]/\"></body></html>");
+
+      Set<URI> s = new HashSet<>();
+      s.add(new URI("http://[ff01::114]/page1"));
+      s.add(new URI("http://[ff01::114]/page2"));
+      linkMap.put("http://[ff01::114]", s);
+      linkMap.put("http://[ff01::114]/index.html", s);
+      s = new HashSet<>();
+      s.add(new URI("http://[ff01::114]/page1/foo"));
+      linkMap.put("http://[ff01::114]/page1", s);
+      s = new HashSet<>();
+      s.add(new URI("http://[ff01::114]/page1/foo/bar"));
+      linkMap.put("http://[ff01::114]/page1/foo", s);
+      s = new HashSet<>();
+      s.add(new URI("http://[ff01::114]/disallowed"));
+      linkMap.put("http://[ff01::114]/page2", s);
+
+      // Simulate a robots.txt file with comments and a few disallows
+      StringBuilder sb = new StringBuilder();
+      sb.append(
+          "# Comments appear after the \"#\" symbol at the start of a line, or after a directive\n");
+      sb.append("User-agent: * # match all bots\n");
+      sb.append("Disallow:  # This is void\n");
+      sb.append("Disallow: /disallow # Disallow this path\n");
+      sb.append("Disallow: /nonexistentpath # Disallow this path\n");
+      this.robotsCache.put(
+          "[ff01::114]",
+          super.parseRobotsTxt(
+              new ByteArrayInputStream(sb.toString().getBytes(StandardCharsets.UTF_8))));
+    }
+
+    @Override
+    public PostTool.PageFetcherResult readPageFromUrl(URL u) {
+      PostTool.PageFetcherResult res = new PostTool.PageFetcherResult();
+      if (isDisallowedByRobots(u)) {
+        res.httpStatus = 403;
+        return res;
+      }
+      res.httpStatus = 200;
+      res.contentType = "text/html";
+      res.content = ByteBuffer.wrap(htmlMap.get(u.toString()).getBytes(StandardCharsets.UTF_8));
+      return res;
+    }
+
+    @Override
+    public Set<URI> getLinksFromWebPage(URL url, InputStream is, String type, URL postUrl) {
+      Set<URI> s = linkMap.get(PostTool.normalizeUrlEnding(url.toString()));
+      if (s == null) {
+        s = new HashSet<>();
+      }
+      return s;
+    }
+  }
 }
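
A note on testRobotsExclusion above: it expects exactly two cache entries from the
simulated robots.txt, which implies that comments are stripped and that the empty
Disallow line is ignored. A minimal sketch of parsing with those semantics follows;
the class and method names are illustrative only, and the real
PostTool.PageFetcher#parseRobotsTxt may differ in its details.

    import java.io.BufferedReader;
    import java.io.IOException;
    import java.io.InputStream;
    import java.io.InputStreamReader;
    import java.nio.charset.StandardCharsets;
    import java.util.ArrayList;
    import java.util.List;

    final class RobotsTxtSketch {
      // Returns the non-empty Disallow paths, one list entry per rule.
      static List<String> parseDisallows(InputStream is) throws IOException {
        List<String> disallows = new ArrayList<>();
        try (BufferedReader reader =
            new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8))) {
          String line;
          while ((line = reader.readLine()) != null) {
            int hash = line.indexOf('#');
            if (hash >= 0) {
              line = line.substring(0, hash); // strip trailing comments
            }
            line = line.trim();
            if (line.regionMatches(true, 0, "Disallow:", 0, 9)) {
              String path = line.substring(9).trim();
              if (!path.isEmpty()) { // an empty Disallow matches nothing
                disallows.add(path);
              }
            }
          }
        }
        return disallows;
      }
    }

Fed the fixture above, this yields /disallow and /nonexistentpath, matching the
two-entry assertion.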
diff --git a/solr/core/src/test/org/apache/solr/cloud/SolrCloudExampleTest.java b/solr/core/src/test/org/apache/solr/cloud/SolrCloudExampleTest.java
index b6c63148a4e..a40903c1f0a 100644
--- a/solr/core/src/test/org/apache/solr/cloud/SolrCloudExampleTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/SolrCloudExampleTest.java
@@ -20,22 +20,15 @@ import java.io.File;
 import java.lang.invoke.MethodHandles;
 import java.nio.file.Files;
 import java.nio.file.Path;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.List;
-import java.util.Random;
 import java.util.Set;
 import java.util.concurrent.TimeUnit;
-import java.util.stream.Collectors;
-import java.util.stream.Stream;
 import org.apache.commons.cli.CommandLine;
 import org.apache.solr.cli.CreateCollectionTool;
 import org.apache.solr.cli.DeleteTool;
 import org.apache.solr.cli.HealthcheckTool;
+import org.apache.solr.cli.PostTool;
 import org.apache.solr.cli.SolrCLI;
 import org.apache.solr.client.solrj.SolrQuery;
-import org.apache.solr.client.solrj.request.StreamingUpdateRequest;
 import org.apache.solr.client.solrj.response.QueryResponse;
 import org.apache.solr.common.cloud.ZkStateReader;
 import org.apache.solr.util.ExternalPaths;
@@ -115,52 +108,36 @@ public class SolrCloudExampleTest extends AbstractFullDistribZkTestBase {
         invalidToolExitStatus,
         tool.runTool(cli));
 
-    // now index docs like bin/solr post would, but we can't use SimplePostTool because it uses
-    // System.exit when it encounters an error, which JUnit doesn't like ...
+    // now index docs ...
     log.info("Created collection, now posting example docs!");
     Path exampleDocsDir = Path.of(ExternalPaths.SOURCE_HOME, "example", "exampledocs");
     assertTrue(exampleDocsDir.toAbsolutePath() + " not found!", Files.isDirectory(exampleDocsDir));
 
-    List<Path> xmlFiles;
-    try (Stream<Path> stream = Files.walk(exampleDocsDir, 1)) {
-      xmlFiles =
-          stream
-              .filter(path -> path.getFileName().toString().endsWith(".xml"))
-              // don't rely on File.compareTo, it's behavior varies by OS
-              .sorted(Comparator.comparing(path -> path.getFileName().toString()))
-              // be explicit about the collection type because we will shuffle it later
-              .collect(Collectors.toCollection(ArrayList::new));
-    }
+    String[] argsForPost =
+        new String[] {
+          "--solr-update-url",
+          solrUrl + "/" + testCollectionName + "/update",
+          "-filetypes",
+          "xml",
+          exampleDocsDir.toAbsolutePath().toString()
+        };
 
-    // force a deterministic random ordering of the files so seeds reproduce regardless of
-    // platform/filesystem
-    Collections.shuffle(xmlFiles, new Random(random().nextLong()));
+    PostTool postTool = new PostTool();
+    CommandLine postCli =
+        SolrCLI.processCommandLineArgs(postTool.getName(), postTool.getOptions(), argsForPost);
+    postTool.runTool(postCli);
 
-    // if you add/remove example XML docs, you'll have to fix these expected values
-    int expectedXmlFileCount = 14;
     int expectedXmlDocCount = 32;
 
-    assertEquals(
-        "Unexpected # of example XML files in " + exampleDocsDir.toAbsolutePath(),
-        expectedXmlFileCount,
-        xmlFiles.size());
-
-    for (Path xml : xmlFiles) {
-      if (log.isInfoEnabled()) {
-        log.info("POSTing {}", xml.toAbsolutePath());
-      }
-      cloudClient.request(
-          new StreamingUpdateRequest("/update", xml, "application/xml"), testCollectionName);
-    }
-    cloudClient.commit(testCollectionName);
-
     int numFound = 0;
 
     // give the update a chance to take effect.
     for (int idx = 0; idx < 100; ++idx) {
       QueryResponse qr = cloudClient.query(testCollectionName, new SolrQuery("*:*"));
       numFound = (int) qr.getResults().getNumFound();
-      if (numFound == expectedXmlDocCount) break;
+      if (numFound == expectedXmlDocCount) {
+        break;
+      }
       Thread.sleep(100);
     }
     assertEquals("*:* found unexpected number of documents", expectedXmlDocCount, numFound);
diff --git a/solr/packaging/test/test_post.bats b/solr/packaging/test/test_post.bats
index 34f39cfad87..1dcb561afa8 100644
--- a/solr/packaging/test/test_post.bats
+++ b/solr/packaging/test/test_post.bats
@@ -78,7 +78,7 @@ teardown() {
   
   solr create_collection -c monitors_no_type -d _default
   
-  run solr post -url http://localhost:${SOLR_PORT}/solr/monitors_no_type/update -commit ${SOLR_TIP}/example/exampledocs/monitor.xml
+  run solr post -url http://localhost:${SOLR_PORT}/solr/monitors_no_type/update ${SOLR_TIP}/example/exampledocs/monitor.xml
 
   assert_output --partial '1 files indexed.'
   refute_output --partial 'ERROR'
@@ -87,7 +87,7 @@ teardown() {
   
   solr create_collection -c books_no_type -d _default
   
-  run solr post -url http://localhost:${SOLR_PORT}/solr/books_no_type/update -commit ${SOLR_TIP}/example/exampledocs/books.json
+  run solr post -url http://localhost:${SOLR_PORT}/solr/books_no_type/update ${SOLR_TIP}/example/exampledocs/books.json
 
   assert_output --partial '1 files indexed.'
   refute_output --partial 'ERROR'
@@ -96,7 +96,7 @@ teardown() {
   
   solr create_collection -c books_csv_no_type -d _default
   
-  run solr post -url http://localhost:${SOLR_PORT}/solr/books_csv_no_type/update -commit ${SOLR_TIP}/example/exampledocs/books.csv
+  run solr post -url http://localhost:${SOLR_PORT}/solr/books_csv_no_type/update ${SOLR_TIP}/example/exampledocs/books.csv
 
   assert_output --partial '1 files indexed.'
   refute_output --partial 'ERROR'
@@ -104,12 +104,22 @@ teardown() {
   assert_output --partial '"numFound":10'  
 }
 
+@test "crawling a directory as a dry-run" {
+  
+  # We filter to xml, json, and csv as we don't want to invoke the Extract handler; the whole test runs as a dry run
+  run solr post --dry-run -filetypes xml,json,csv -url http://localhost:${SOLR_PORT}/solr/foobar/update -skipcommit ${SOLR_TIP}/example/exampledocs
+
+  assert_output --partial 'Dry run complete. 16 would have been indexed.' 
+  refute_output --partial '16 files indexed.'
+  refute_output --partial 'ERROR'
+}
+
 @test "crawling a directory" {
   
   solr create_collection -c mixed_content -d _default
   
   # We filter to xml, json, and csv as we don't want to invoke the Extract handler.
-  run solr post -filetypes xml,json,csv -url http://localhost:${SOLR_PORT}/solr/mixed_content/update -commit ${SOLR_TIP}/example/exampledocs
+  run solr post -filetypes xml,json,csv -url http://localhost:${SOLR_PORT}/solr/mixed_content/update ${SOLR_TIP}/example/exampledocs
 
   assert_output --partial '16 files indexed.'
   refute_output --partial 'ERROR'
@@ -129,7 +139,7 @@ teardown() {
     }
   }' "http://localhost:${SOLR_PORT}/solr/webcrawl/config"
   
-  run solr post -mode web -url http://localhost:${SOLR_PORT}/webcrawl/update -recursive 1 -delay 1 https://solr.apache.org
+  run solr post -mode web --solr-update-url http://localhost:${SOLR_PORT}/webcrawl/update -recursive 1 -delay 1 https://solr.apache.org
   assert_output --partial 'Entering crawl at level 0'
 }
 
@@ -152,7 +162,7 @@ teardown() {
   run solr create_collection -c test_args -d _default
   assert_output --partial "Created collection 'test_args'"
   
-  run solr post -url http://localhost:${SOLR_PORT}/solr/test_args/update -mode args -type application/xml -out -commit "<delete><query>*:*</query></delete>"
+  run solr post -url http://localhost:${SOLR_PORT}/solr/test_args/update -mode args -type application/xml -out "<delete><query>*:*</query></delete>"
   assert_output --partial '<int name="status">0</int>'
   
   # confirm default type