Posted to commits@solr.apache.org by ep...@apache.org on 2024/02/19 17:10:35 UTC
(solr) 01/02: SOLR-17159: Untangle PostTool and SimplePostTool code (#2275)
This is an automated email from the ASF dual-hosted git repository.
epugh pushed a commit to branch branch_9x
in repository https://gitbox.apache.org/repos/asf/solr.git
commit 8a427ebc606d7967bcaaef30a9449bc0bf61b25b
Author: Eric Pugh <ep...@opensourceconnections.com>
AuthorDate: Mon Feb 19 11:46:47 2024 -0500
SOLR-17159: Untangle PostTool and SimplePostTool code (#2275)
* Copied unit tests from SimplePostToolTest to PostToolTest
* Added a --dry-run mode that simulates sending documents to Solr
* Removed a few more places where -commit is no longer needed now that SOLR-17147 is complete
* Cleaned up the code base to have fewer code-quality warnings
* Updated SolrCloudExampleTest to use PostTool instead of simulating its usage
* Made the long form of -url an explicit --solr-update-url, to be clear what it is for (see the sketch below)
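For illustration, a minimal sketch of driving the renamed option and the new dry-run flag
programmatically, in the same style the updated RunExampleTool below uses (this assumes code
living in the org.apache.solr.cli package; the collection URL and docs path are hypothetical,
and -skipcommit keeps the dry run free of any network call):

    PostTool postTool = new PostTool();
    String[] args = {
      "post",
      "--solr-update-url", "http://localhost:8983/solr/films/update",
      "--dry-run",
      "-skipcommit",
      "/path/to/exampledocs"
    };
    CommandLine postToolCli =
        SolrCLI.parseCmdLine(postTool.getName(), args, postTool.getOptions());
    postTool.runTool(postToolCli); // logs what would be posted; nothing is sent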
---
solr/CHANGES.txt | 3 +
.../src/java/org/apache/solr/cli/PostTool.java | 1166 +++++++++++++++++++-
.../java/org/apache/solr/cli/RunExampleTool.java | 30 +-
.../src/test/org/apache/solr/cli/PostToolTest.java | 220 +++-
.../apache/solr/cloud/SolrCloudExampleTest.java | 57 +-
solr/packaging/test/test_post.bats | 22 +-
6 files changed, 1408 insertions(+), 90 deletions(-)
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index ab43eb599a7..54c96bc87bb 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -26,6 +26,9 @@ Improvements
* SOLR-17145: The INSTALLSHARDDATA API now includes a 'requestid' field when run asynchronously (Jason Gerlowski)
+* SOLR-17159: bin/solr post now has proper unit testing. Users can specify a --dry-run option to
+ simulate posting documents without sending them to Solr. (Eric Pugh)
+
Optimizations
---------------------
* SOLR-17144: Close searcherExecutor thread per core after 1 minute (Pierre Salagnac, Christine Poerschke)
diff --git a/solr/core/src/java/org/apache/solr/cli/PostTool.java b/solr/core/src/java/org/apache/solr/cli/PostTool.java
index de716c131cb..0e3bc6b77c1 100644
--- a/solr/core/src/java/org/apache/solr/cli/PostTool.java
+++ b/solr/core/src/java/org/apache/solr/cli/PostTool.java
@@ -16,15 +16,144 @@
*/
package org.apache.solr.cli;
+import static java.nio.charset.StandardCharsets.US_ASCII;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+import java.io.BufferedReader;
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.FileFilter;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.PrintStream;
+import java.net.HttpURLConnection;
+import java.net.MalformedURLException;
+import java.net.ProtocolException;
+import java.net.URI;
+import java.net.URISyntaxException;
import java.net.URL;
+import java.net.URLEncoder;
+import java.nio.ByteBuffer;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.security.GeneralSecurityException;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.Base64;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.LinkedHashSet;
import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Set;
+import java.util.TimeZone;
+import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
+import java.util.zip.GZIPInputStream;
+import java.util.zip.Inflater;
+import java.util.zip.InflaterInputStream;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.xpath.XPath;
+import javax.xml.xpath.XPathConstants;
+import javax.xml.xpath.XPathExpression;
+import javax.xml.xpath.XPathExpressionException;
+import javax.xml.xpath.XPathFactory;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.Option;
+import org.apache.solr.client.api.util.SolrVersion;
+import org.apache.solr.client.solrj.SolrClient;
+import org.apache.solr.client.solrj.SolrServerException;
+import org.apache.solr.common.util.Utils;
+import org.apache.solr.util.RTimer;
+import org.w3c.dom.Document;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+import org.xml.sax.SAXException;
public class PostTool extends ToolBase {
+ public static final String DEFAULT_FILE_TYPES =
+ "xml,json,jsonl,csv,pdf,doc,docx,ppt,pptx,xls,xlsx,odt,odp,ods,ott,otp,ots,rtf,htm,html,txt,log";
+ static final String DATA_MODE_FILES = "files";
+ static final String DATA_MODE_ARGS = "args";
+ static final String DATA_MODE_STDIN = "stdin";
+ static final String DEFAULT_DATA_MODE = DATA_MODE_FILES;
+ static final String FORMAT_SOLR = "solr";
+ static final String DATA_MODE_WEB = "web";
+
+ private static final int DEFAULT_WEB_DELAY = 10;
+ private static final int MAX_WEB_DEPTH = 10;
+ public static final String DEFAULT_CONTENT_TYPE = "application/json";
+
+ // Input args
+ int recursive = 0;
+ int delay = 0;
+ String fileTypes = PostTool.DEFAULT_FILE_TYPES;
+ URL solrUpdateUrl;
+ String credentials;
+ OutputStream out = null;
+ String type;
+ String format;
+ String mode = DEFAULT_DATA_MODE;
+ boolean commit;
+ boolean optimize;
+ boolean dryRun; // Avoids actual network traffic to Solr
+
+ String[] args;
+
+ boolean auto = true;
+ private int currentDepth;
+
+ static HashMap<String, String> mimeMap;
+ FileFilter fileFilter;
+ // Backlog for crawling
+ List<LinkedHashSet<URI>> backlog = new ArrayList<>();
+ Set<URI> visited = new HashSet<>();
+
+ static final Set<String> DATA_MODES = new HashSet<>();
+
+ PostTool.PageFetcher pageFetcher = new PostTool.PageFetcher();
+
+ static {
+ DATA_MODES.add(DATA_MODE_FILES);
+ DATA_MODES.add(DATA_MODE_ARGS);
+ DATA_MODES.add(DATA_MODE_STDIN);
+ DATA_MODES.add(DATA_MODE_WEB);
+
+ mimeMap = new HashMap<>();
+ mimeMap.put("xml", "application/xml");
+ mimeMap.put("csv", "text/csv");
+ mimeMap.put("json", "application/json");
+ mimeMap.put("jsonl", "application/jsonl");
+ mimeMap.put("pdf", "application/pdf");
+ mimeMap.put("rtf", "text/rtf");
+ mimeMap.put("html", "text/html");
+ mimeMap.put("htm", "text/html");
+ mimeMap.put("doc", "application/msword");
+ mimeMap.put("docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document");
+ mimeMap.put("ppt", "application/vnd.ms-powerpoint");
+ mimeMap.put(
+ "pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation");
+ mimeMap.put("xls", "application/vnd.ms-excel");
+ mimeMap.put("xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
+ mimeMap.put("odt", "application/vnd.oasis.opendocument.text");
+ mimeMap.put("ott", "application/vnd.oasis.opendocument.text");
+ mimeMap.put("odp", "application/vnd.oasis.opendocument.presentation");
+ mimeMap.put("otp", "application/vnd.oasis.opendocument.presentation");
+ mimeMap.put("ods", "application/vnd.oasis.opendocument.spreadsheet");
+ mimeMap.put("ots", "application/vnd.oasis.opendocument.spreadsheet");
+ mimeMap.put("txt", "text/plain");
+ mimeMap.put("log", "text/plain");
+ }
+
public PostTool() {
this(CLIO.getOutStream());
}
@@ -43,9 +172,10 @@ public class PostTool extends ToolBase {
return List.of(
Option.builder("url")
.argName("url")
+ .longOpt("solr-update-url")
.hasArg()
.required(false)
- .desc("<base Solr update URL>")
+ .desc("Solr Update URL, the full url to the update handler, including the /update.")
.build(),
Option.builder("c")
.longOpt("name")
@@ -66,7 +196,8 @@ public class PostTool extends ToolBase {
.argName("mode")
.hasArg(true)
.required(false)
- .desc("Files crawls files, web crawls website. default: files.")
+ .desc(
+ "Files crawls files, web crawls website, args processes input args, and stdin reads a command from standard in. default: files.")
.build(),
Option.builder("recursive")
.argName("recursive")
@@ -85,13 +216,13 @@ public class PostTool extends ToolBase {
.argName("content-type")
.hasArg(true)
.required(false)
- .desc("default: application/json")
+ .desc("Specify a specific mimetype to use, such as application/json.")
.build(),
Option.builder("filetypes")
.argName("<type>[,<type>,...]")
.hasArg(true)
.required(false)
- .desc("default: " + SimplePostTool.DEFAULT_FILE_TYPES)
+ .desc("default: " + DEFAULT_FILE_TYPES)
.build(),
Option.builder("params")
.argName("<key>=<value>[&<key>=<value>...]")
@@ -107,6 +238,12 @@ public class PostTool extends ToolBase {
.required(false)
.desc(
"sends application/json content as Solr commands to /update instead of /update/json/docs.")
+ .build(),
+ Option.builder()
+ .longOpt("dry-run")
+ .required(false)
+ .desc(
+ "Performs a dry run of the posting process without actually sending documents to Solr. Only works with files mode.")
.build());
}
@@ -114,52 +251,1027 @@ public class PostTool extends ToolBase {
public void runImpl(CommandLine cli) throws Exception {
SolrCLI.raiseLogLevelUnlessVerbose(cli);
- URL solrUrl = null;
+ solrUpdateUrl = null;
if (cli.hasOption("url")) {
String url = cli.getOptionValue("url");
- solrUrl = new URL(url);
+ solrUpdateUrl = new URL(url);
} else if (cli.hasOption("c")) {
String url = SolrCLI.getDefaultSolrUrl() + "/solr/" + cli.getOptionValue("c") + "/update";
- solrUrl = new URL(url);
+ solrUpdateUrl = new URL(url);
} else {
throw new IllegalArgumentException(
"Must specify either -url or -c parameter to post documents.");
}
- String mode = SimplePostTool.DEFAULT_DATA_MODE;
if (cli.hasOption("mode")) {
mode = cli.getOptionValue("mode");
}
- boolean auto = true;
- String type = null;
+
+ if (cli.hasOption("dry-run")) {
+ dryRun = true;
+ }
+
if (cli.hasOption("type")) {
type = cli.getOptionValue("type");
+ // Turn off automatically looking up the mimetype in favour of what is passed in.
+ auto = false;
}
- String format =
- cli.hasOption("format")
- ? SimplePostTool.FORMAT_SOLR
- : ""; // i.e not solr formatted json commands
+ format = cli.hasOption("format") ? FORMAT_SOLR : ""; // i.e. not Solr-formatted JSON commands
- String fileTypes = SimplePostTool.DEFAULT_FILE_TYPES;
if (cli.hasOption("filetypes")) {
fileTypes = cli.getOptionValue("filetypes");
}
- int defaultDelay = (mode.equals((SimplePostTool.DATA_MODE_WEB)) ? 10 : 0);
- int delay = Integer.parseInt(cli.getOptionValue("delay", String.valueOf(defaultDelay)));
- int recursive = Integer.parseInt(cli.getOptionValue("recursive", "1"));
+ int defaultDelay = (mode.equals((DATA_MODE_WEB)) ? 10 : 0);
+ delay = Integer.parseInt(cli.getOptionValue("delay", String.valueOf(defaultDelay)));
+ recursive = Integer.parseInt(cli.getOptionValue("recursive", "1"));
- OutputStream out = cli.hasOption("out") ? CLIO.getOutStream() : null;
- boolean commit = cli.hasOption("skipcommit") ? false : true;
- boolean optimize = cli.hasOption("optimize");
+ out = cli.hasOption("out") ? CLIO.getOutStream() : null;
+ commit = cli.hasOption("skipcommit") ? false : true;
+ optimize = cli.hasOption("optimize");
- String[] args = cli.getArgs();
+ args = cli.getArgs();
- SimplePostTool spt =
- new SimplePostTool(
- mode, solrUrl, auto, type, format, recursive, delay, fileTypes, out, commit, optimize,
- args);
+ execute();
+ }
+
+ /**
+ * After initialization, call execute to start the post job. This method delegates to the correct
+ * mode method.
+ */
+ public void execute() throws SolrServerException, IOException {
+ final RTimer timer = new RTimer();
+ if (PostTool.DATA_MODE_FILES.equals(mode)) {
+ doFilesMode();
+ } else if (DATA_MODE_ARGS.equals(mode)) {
+ doArgsMode(args);
+ } else if (PostTool.DATA_MODE_WEB.equals(mode)) {
+ doWebMode();
+ } else if (DATA_MODE_STDIN.equals(mode)) {
+ doStdinMode();
+ } else {
+ return;
+ }
+
+ if (commit) {
+ commit();
+ }
+ if (optimize) {
+ optimize();
+ }
+ displayTiming((long) timer.getTime());
+ }
+
+ private void doFilesMode() {
+ currentDepth = 0;
+
+ info(
+ "Posting files to [base] url "
+ + solrUpdateUrl
+ + (!auto ? " using content-type " + (type == null ? DEFAULT_CONTENT_TYPE : type) : "")
+ + "...");
+ if (auto) {
+ info("Entering auto mode. File endings considered are " + fileTypes);
+ }
+ if (recursive > 0) {
+ info("Entering recursive mode, max depth=" + recursive + ", delay=" + delay + "s");
+ }
+ fileFilter = getFileFilterFromFileTypes(fileTypes);
+ int numFilesPosted = postFiles(args, 0, out, type);
+ if (dryRun) {
+ info("Dry run complete. " + numFilesPosted + " would have been indexed.");
+ } else {
+ info(numFilesPosted + " files indexed.");
+ }
+ }
+
+ private void doArgsMode(String[] args) {
+ info("POSTing args to " + solrUpdateUrl + "...");
+ for (String a : args) {
+ postData(stringToStream(a), null, out, type, solrUpdateUrl);
+ }
+ }
+
+ private void doWebMode() {
+ reset();
+ int numPagesPosted = 0;
+ try {
+ if (type != null) {
+ throw new IllegalArgumentException(
+ "Specifying content-type with \"-Ddata=web\" is not supported");
+ }
+
+ // Set Extracting handler as default
+ solrUpdateUrl = appendUrlPath(solrUpdateUrl, "/extract");
+
+ info("Posting web pages to Solr url " + solrUpdateUrl);
+ auto = true;
+ info(
+ "Entering auto mode. Indexing pages with content-types corresponding to file endings "
+ + fileTypes);
+ if (recursive > 0) {
+ if (recursive > MAX_WEB_DEPTH) {
+ recursive = MAX_WEB_DEPTH;
+ warn("Too large recursion depth for web mode, limiting to " + MAX_WEB_DEPTH + "...");
+ }
+ if (delay < DEFAULT_WEB_DELAY) {
+ warn(
+ "Never crawl an external web site faster than every 10 seconds, your IP will probably be blocked");
+ }
+ info("Entering recursive mode, depth=" + recursive + ", delay=" + delay + "s");
+ }
+ numPagesPosted = postWebPages(args, 0, out);
+ info(numPagesPosted + " web pages indexed.");
+
+ } catch (MalformedURLException e) {
+ warn("Wrong URL trying to append /extract to " + solrUpdateUrl);
+ }
+ }
+
+ private void doStdinMode() {
+ info("POSTing stdin to " + solrUpdateUrl + "...");
+ postData(System.in, null, out, type, solrUpdateUrl);
+ }
+
+ private void reset() {
+ backlog = new ArrayList<>();
+ visited = new HashSet<>();
+ }
+
+ /**
+ * Pretty prints the number of milliseconds taken to post the content to Solr
+ *
+ * @param millis the time in milliseconds
+ */
+ private void displayTiming(long millis) {
+ SimpleDateFormat df = new SimpleDateFormat("H:mm:ss.SSS", Locale.getDefault());
+ df.setTimeZone(TimeZone.getTimeZone("UTC"));
+ CLIO.out("Time spent: " + df.format(new Date(millis)));
+ }
+
+ private boolean checkIsValidPath(File srcFile) {
+ return Files.exists(srcFile.toPath());
+ }
+
+ /**
+ * Post all filenames provided in args
+ *
+ * @param args array of file names
+ * @param startIndexInArgs offset to start
+ * @param out output stream to post data to
+ * @param type default content-type to use when posting (may be overridden in auto mode)
+ * @return number of files posted
+ */
+ public int postFiles(String[] args, int startIndexInArgs, OutputStream out, String type) {
+ reset();
+ int filesPosted = 0;
+ for (int j = startIndexInArgs; j < args.length; j++) {
+ File srcFile = new File(args[j]);
+ filesPosted = getFilesPosted(out, type, srcFile);
+ }
+ return filesPosted;
+ }
+
+ private int getFilesPosted(final OutputStream out, final String type, final File srcFile) {
+ int filesPosted = 0;
+ boolean isValidPath = checkIsValidPath(srcFile);
+ if (isValidPath && srcFile.isDirectory() && srcFile.canRead()) {
+ filesPosted += postDirectory(srcFile, out, type);
+ } else if (isValidPath && srcFile.isFile() && srcFile.canRead()) {
+ filesPosted += postFiles(new File[] {srcFile}, out, type);
+ } else {
+ filesPosted += handleGlob(srcFile, out, type);
+ }
+ return filesPosted;
+ }
+
+ /**
+ * Posts a whole directory
+ *
+ * @return number of files posted total
+ */
+ private int postDirectory(File dir, OutputStream out, String type) {
+ if (dir.isHidden() && !dir.getName().equals(".")) {
+ return (0);
+ }
+ info(
+ "Indexing directory "
+ + dir.getPath()
+ + " ("
+ + dir.listFiles(fileFilter).length
+ + " files, depth="
+ + currentDepth
+ + ")");
+ int posted = 0;
+ posted += postFiles(dir.listFiles(fileFilter), out, type);
+ if (recursive > currentDepth) {
+ for (File d : dir.listFiles()) {
+ if (d.isDirectory()) {
+ currentDepth++;
+ posted += postDirectory(d, out, type);
+ currentDepth--;
+ }
+ }
+ }
+ return posted;
+ }
+
+ /**
+ * Posts a list of file names
+ *
+ * @return number of files posted
+ */
+ int postFiles(File[] files, OutputStream out, String type) {
+ int filesPosted = 0;
+ for (File srcFile : files) {
+ try {
+ if (!srcFile.isFile() || srcFile.isHidden()) {
+ continue;
+ }
+ postFile(srcFile, out, type);
+ Thread.sleep(delay * 1000L);
+ filesPosted++;
+ } catch (InterruptedException | MalformedURLException e) {
+ throw new RuntimeException(e);
+ }
+ }
+ return filesPosted;
+ }
+
+ /**
+ * This only handles file globs, not full path globbing.
+ *
+ * @param globFile file holding glob path
+ * @param out outputStream to write results to
+ * @param type default content-type to use when posting (may be overridden in auto mode)
+ * @return number of files posted
+ */
+ int handleGlob(File globFile, OutputStream out, String type) {
+ int filesPosted = 0;
+ File parent = globFile.getParentFile();
+ if (parent == null) {
+ parent = new File(".");
+ }
+ String fileGlob = globFile.getName();
+ PostTool.GlobFileFilter ff = new PostTool.GlobFileFilter(fileGlob, false);
+ File[] fileList = parent.listFiles(ff);
+ if (fileList == null || fileList.length == 0) {
+ warn("No files or directories matching " + globFile);
+ } else {
+ filesPosted = postFiles(fileList, out, type);
+ }
+ return filesPosted;
+ }
+
+ /**
+ * This method takes as input a list of start URL strings for crawling, converts them to URIs,
+ * adds each one to the backlog, and then starts crawling
+ *
+ * @param args the raw input args from main()
+ * @param startIndexInArgs offset for where to start
+ * @param out outputStream to write results to
+ * @return the number of web pages posted
+ */
+ public int postWebPages(String[] args, int startIndexInArgs, OutputStream out) {
+ reset();
+ LinkedHashSet<URI> s = new LinkedHashSet<>();
+ for (int j = startIndexInArgs; j < args.length; j++) {
+ try {
+ URI uri = new URI(normalizeUrlEnding(args[j]));
+ s.add(uri);
+ } catch (URISyntaxException e) {
+ warn("Skipping malformed input URL: " + args[j]);
+ }
+ }
+ // Add URIs to level 0 of the backlog and start recursive crawling
+ backlog.add(s);
+ return webCrawl(0, out);
+ }
+
+ /**
+ * Normalizes a URL string by removing the anchor part and any trailing "?" or slash
+ *
+ * @return the normalized URL string
+ */
+ protected static String normalizeUrlEnding(String link) {
+ if (link.contains("#")) {
+ link = link.substring(0, link.indexOf('#'));
+ }
+ if (link.endsWith("?")) {
+ link = link.substring(0, link.length() - 1);
+ }
+ if (link.endsWith("/")) {
+ link = link.substring(0, link.length() - 1);
+ }
+ return link;
+ }
+
+ /**
+ * A very simple crawler that pulls URLs to fetch from a backlog and then recurses N levels deep
+ * if recursive>0. Links are parsed from HTML by first getting an XHTML version using
+ * SolrCell with extractOnly, and are followed if they are local. The crawler pauses for a default
+ * delay of 10 seconds between each fetch; this can be configured via the delay variable. This is
+ * only meant for test purposes, as it does not respect robots or anything else fancy :)
+ *
+ * @param level which level to crawl
+ * @param out output stream to write to
+ * @return number of pages crawled on this level and below
+ */
+ protected int webCrawl(int level, OutputStream out) {
+ int numPages = 0;
+ LinkedHashSet<URI> stack = backlog.get(level);
+ int rawStackSize = stack.size();
+ stack.removeAll(visited);
+ int stackSize = stack.size();
+ LinkedHashSet<URI> subStack = new LinkedHashSet<>();
+ info(
+ "Entering crawl at level "
+ + level
+ + " ("
+ + rawStackSize
+ + " links total, "
+ + stackSize
+ + " new)");
+ for (URI uri : stack) {
+ try {
+ visited.add(uri);
+ URL url = uri.toURL();
+ PostTool.PageFetcherResult result = pageFetcher.readPageFromUrl(url);
+ if (result.httpStatus == 200) {
+ url = (result.redirectUrl != null) ? result.redirectUrl : url;
+ URL postUrl =
+ new URL(
+ appendParam(
+ solrUpdateUrl.toString(),
+ "literal.id="
+ + URLEncoder.encode(url.toString(), UTF_8)
+ + "&literal.url="
+ + URLEncoder.encode(url.toString(), UTF_8)));
+ ByteBuffer content = result.content;
+ boolean success =
+ postData(
+ new ByteArrayInputStream(content.array(), content.arrayOffset(), content.limit()),
+ null,
+ out,
+ result.contentType,
+ postUrl);
+ if (success) {
+ info("POSTed web resource " + url + " (depth: " + level + ")");
+ Thread.sleep(delay * 1000L);
+ numPages++;
+ // Pull links from HTML pages only
+ if (recursive > level && result.contentType.equals("text/html")) {
+ Set<URI> children =
+ pageFetcher.getLinksFromWebPage(
+ url,
+ new ByteArrayInputStream(
+ content.array(), content.arrayOffset(), content.limit()),
+ result.contentType,
+ postUrl);
+ subStack.addAll(children);
+ }
+ } else {
+ warn("An error occurred while posting " + uri);
+ }
+ } else {
+ warn("The URL " + uri + " returned a HTTP result status of " + result.httpStatus);
+ }
+ } catch (IOException | URISyntaxException e) {
+ warn("Caught exception when trying to open connection to " + uri + ": " + e.getMessage());
+ } catch (InterruptedException e) {
+ throw new RuntimeException(e);
+ }
+ }
+ if (!subStack.isEmpty()) {
+ backlog.add(subStack);
+ numPages += webCrawl(level + 1, out);
+ }
+ return numPages;
+ }
+
+ /**
+ * Computes the full URL based on a base URL and a possibly relative link found in the href attribute
+ * of an HTML anchor.
+ *
+ * @param baseUrl the base url from where the link was found
+ * @param link the absolute or relative link
+ * @return the string version of the full URL
+ */
+ protected String computeFullUrl(URL baseUrl, String link) {
+ if (link == null || link.length() == 0) {
+ return null;
+ }
+ if (!link.startsWith("http")) {
+ if (link.startsWith("/")) {
+ link = baseUrl.getProtocol() + "://" + baseUrl.getAuthority() + link;
+ } else {
+ if (link.contains(":")) {
+ return null; // Skip non-relative URLs
+ }
+ String path = baseUrl.getPath();
+ if (!path.endsWith("/")) {
+ int sep = path.lastIndexOf('/');
+ String file = path.substring(sep + 1);
+ if (file.contains(".") || file.contains("?")) {
+ path = path.substring(0, sep);
+ }
+ }
+ link = baseUrl.getProtocol() + "://" + baseUrl.getAuthority() + path + "/" + link;
+ }
+ }
+ link = normalizeUrlEnding(link);
+ String l = link.toLowerCase(Locale.ROOT);
+ // Simple brute force skip images
+ if (l.endsWith(".jpg") || l.endsWith(".jpeg") || l.endsWith(".png") || l.endsWith(".gif")) {
+ return null; // Skip images
+ }
+ return link;
+ }
+
+ /**
+ * Uses the mime-type map, via reverse lookup, to determine whether the file ending for the given
+ * type is supported by the fileTypes option
+ *
+ * @param type what content-type to lookup
+ * @return true if this is a supported content type
+ */
+ protected boolean typeSupported(String type) {
+ for (Map.Entry<String, String> entry : mimeMap.entrySet()) {
+ if (entry.getValue().equals(type)) {
+ if (fileTypes.contains(entry.getKey())) {
+ return true;
+ }
+ }
+ }
+ return false;
+ }
+
+ static void warn(String msg) {
+ CLIO.err("PostTool: WARNING: " + msg);
+ }
+
+ static void info(String msg) {
+ CLIO.out(msg);
+ }
+
+ /** Does a simple commit operation */
+ public void commit() throws IOException, SolrServerException {
+ info("COMMITting Solr index changes to " + solrUpdateUrl + "...");
+ String url = solrUpdateUrl.toString();
+ url = url.substring(0, url.lastIndexOf("/update"));
+ try (final SolrClient client = SolrCLI.getSolrClient(url, credentials)) {
+ client.commit();
+ }
+ }
+
+ /** Does a simple optimize operation */
+ public void optimize() throws IOException, SolrServerException {
+ info("Performing an OPTIMIZE to " + solrUpdateUrl + "...");
+ String url = solrUpdateUrl.toString();
+ url = url.substring(0, url.lastIndexOf("/update"));
+ try (final SolrClient client = SolrCLI.getSolrClient(url, credentials)) {
+ client.optimize();
+ }
+ }
+
+ /**
+ * Appends a URL query parameter to a URL
+ *
+ * @param url the original URL
+ * @param param the parameter(s) to append, separated by "&"
+ * @return the string version of the resulting URL
+ */
+ public static String appendParam(String url, String param) {
+ String[] pa = param.split("&");
+ for (String p : pa) {
+ if (p.trim().length() == 0) {
+ continue;
+ }
+ String[] kv = p.split("=");
+ if (kv.length == 2) {
+ url = url + (url.contains("?") ? "&" : "?") + kv[0] + "=" + kv[1];
+ } else {
+ warn("Skipping param " + p + " which is not on form key=value");
+ }
+ }
+ return url;
+ }
+
+ /** Opens the file and posts its contents to the solrUrl, writing the response to output. */
+ public void postFile(File file, OutputStream output, String type) throws MalformedURLException {
+ InputStream is = null;
+
+ URL url = solrUpdateUrl;
+ String suffix = "";
+ if (auto) {
+ if (type == null) {
+ type = guessType(file);
+ }
+ // TODO: Add a flag that disables /update and sends all to /update/extract, to avoid CSV,
+ // JSON, and XML files from being interpreted as Solr documents internally
+ if (type.equals("application/json") && !PostTool.FORMAT_SOLR.equals(format)) {
+ suffix = "/json/docs";
+ String urlStr = appendUrlPath(solrUpdateUrl, suffix).toString();
+ url = new URL(urlStr);
+ } else if (type.equals("application/xml")
+ || type.equals("text/csv")
+ || type.equals("application/json")) {
+ // Default handler
+ } else {
+ // SolrCell
+ suffix = "/extract";
+ String urlStr = appendUrlPath(solrUpdateUrl, suffix).toString();
+ if (!urlStr.contains("resource.name")) {
+ urlStr =
+ appendParam(
+ urlStr, "resource.name=" + URLEncoder.encode(file.getAbsolutePath(), UTF_8));
+ }
+ if (!urlStr.contains("literal.id")) {
+ urlStr =
+ appendParam(urlStr, "literal.id=" + URLEncoder.encode(file.getAbsolutePath(), UTF_8));
+ }
+ url = new URL(urlStr);
+ }
+ } else {
+ if (type == null) {
+ type = DEFAULT_CONTENT_TYPE;
+ }
+ }
+ if (dryRun) {
+ info(
+ "DRY RUN of POSTing file "
+ + file.getName()
+ + (auto ? " (" + type + ")" : "")
+ + " to [base]"
+ + suffix);
+ } else {
+ try {
+ info(
+ "POSTing file "
+ + file.getName()
+ + (auto ? " (" + type + ")" : "")
+ + " to [base]"
+ + suffix);
+ is = new FileInputStream(file);
+ postData(is, file.length(), output, type, url);
+ } catch (IOException e) {
+ warn("Can't open/read file: " + file);
+ } finally {
+ try {
+ if (is != null) {
+ is.close();
+ }
+ } catch (IOException e) {
+ warn("IOException while closing file: " + e);
+ }
+ }
+ }
+ }
+
+ /**
+ * Appends to the path of the URL
+ *
+ * @param url the URL
+ * @param append the path to append
+ * @return the final URL version
+ */
+ protected static URL appendUrlPath(URL url, String append) throws MalformedURLException {
+ return new URL(
+ url.getProtocol()
+ + "://"
+ + url.getAuthority()
+ + url.getPath()
+ + append
+ + (url.getQuery() != null ? "?" + url.getQuery() : ""));
+ }
+
+ /**
+ * Guesses the type of a file, based on the file name suffix. Returns "application/octet-stream"
+ * if there is no corresponding mimeMap type.
+ *
+ * @param file the file
+ * @return the content-type guessed
+ */
+ protected static String guessType(File file) {
+ String name = file.getName();
+ String suffix = name.substring(name.lastIndexOf('.') + 1);
+ String type = mimeMap.get(suffix.toLowerCase(Locale.ROOT));
+ return (type != null) ? type : "application/octet-stream";
+ }
+
+ /**
+ * Reads data from the data stream and posts it to Solr, writing the response to output
+ *
+ * @return true if success
+ */
+ public boolean postData(
+ InputStream data, Long length, OutputStream output, String type, URL url) {
+ if (dryRun) {
+ return true;
+ }
+
+ boolean success = true;
+ if (type == null) {
+ type = DEFAULT_CONTENT_TYPE;
+ }
+ HttpURLConnection urlConnection = null;
+ try {
+ try {
+ urlConnection = (HttpURLConnection) url.openConnection();
+ try {
+ urlConnection.setRequestMethod("POST");
+ } catch (ProtocolException e) {
+ warn("Shouldn't happen: HttpURLConnection doesn't support POST??" + e);
+ }
+ urlConnection.setDoOutput(true);
+ urlConnection.setDoInput(true);
+ urlConnection.setUseCaches(false);
+ urlConnection.setAllowUserInteraction(false);
+ urlConnection.setRequestProperty("Content-type", type);
+ basicAuth(urlConnection);
+ if (null != length) {
+ urlConnection.setFixedLengthStreamingMode(length);
+ } else {
+ urlConnection.setChunkedStreamingMode(-1); // use JDK default chunkLen, 4k in Java 8.
+ }
+ urlConnection.connect();
+ } catch (IOException e) {
+ warn("Connection error (is Solr running at " + solrUpdateUrl + " ?): " + e);
+ success = false;
+ } catch (Exception e) {
+ warn("POST failed with error " + e.getMessage());
+ }
+
+ try (final OutputStream out = urlConnection.getOutputStream()) {
+ pipe(data, out);
+ } catch (IOException e) {
+ warn("IOException while posting data: " + e);
+ }
+
+ try {
+ success &= checkResponseCode(urlConnection);
+ try (final InputStream in = urlConnection.getInputStream()) {
+ pipe(in, output);
+ }
+ } catch (IOException e) {
+ warn("IOException while reading response: " + e);
+ success = false;
+ } catch (GeneralSecurityException e) {
+ warn(
+ "Looks like Solr is secured and would not let us in. Try with another user in '-u' parameter");
+ }
+ } finally {
+ if (urlConnection != null) {
+ urlConnection.disconnect();
+ }
+ }
+ return success;
+ }
+
+ private void basicAuth(HttpURLConnection urlc) throws Exception {
+ if (urlc.getURL().getUserInfo() != null) {
+ String encoding =
+ Base64.getEncoder().encodeToString(urlc.getURL().getUserInfo().getBytes(US_ASCII));
+ urlc.setRequestProperty("Authorization", "Basic " + encoding);
+ } else if (credentials != null) {
+ if (!credentials.contains(":")) {
+ throw new Exception("credentials '" + credentials + "' must be of format user:pass");
+ }
+ urlc.setRequestProperty(
+ "Authorization",
+ "Basic " + Base64.getEncoder().encodeToString(credentials.getBytes(UTF_8)));
+ }
+ }
+
+ private static boolean checkResponseCode(HttpURLConnection urlc)
+ throws IOException, GeneralSecurityException {
+ if (urlc.getResponseCode() >= 400) {
+ warn(
+ "Solr returned an error #"
+ + urlc.getResponseCode()
+ + " ("
+ + urlc.getResponseMessage()
+ + ") for url: "
+ + urlc.getURL());
+ Charset charset = StandardCharsets.ISO_8859_1;
+ final String contentType = urlc.getContentType();
+ // code cloned from ContentStreamBase, but post.jar should be standalone!
+ if (contentType != null) {
+ int idx = contentType.toLowerCase(Locale.ROOT).indexOf("charset=");
+ if (idx > 0) {
+ charset = Charset.forName(contentType.substring(idx + "charset=".length()).trim());
+ }
+ }
+ // Print the response returned by Solr
+ try (InputStream errStream = urlc.getErrorStream()) {
+ if (errStream != null) {
+ BufferedReader br = new BufferedReader(new InputStreamReader(errStream, charset));
+ final StringBuilder response = new StringBuilder("Response: ");
+ int ch;
+ while ((ch = br.read()) != -1) {
+ response.append((char) ch);
+ }
+ warn(response.toString().trim());
+ }
+ }
+ if (urlc.getResponseCode() == 401) {
+ throw new GeneralSecurityException(
+ "Solr requires authentication (response 401). Please try again with '-u' option");
+ }
+ if (urlc.getResponseCode() == 403) {
+ throw new GeneralSecurityException(
+ "You are not authorized to perform this action against Solr. (response 403)");
+ }
+ return false;
+ }
+ return true;
+ }
+
+ /**
+ * Converts a string to an input stream
+ *
+ * @param s the string
+ * @return the input stream
+ */
+ public static InputStream stringToStream(String s) {
+ return new ByteArrayInputStream(s.getBytes(StandardCharsets.UTF_8));
+ }
+
+ /**
+ * Pipes everything from the source to the dest. If dest is null, then everything is read from
+ * source and thrown away.
+ */
+ private static void pipe(InputStream source, OutputStream dest) throws IOException {
+ byte[] buf = new byte[1024];
+ int read = 0;
+ while ((read = source.read(buf)) >= 0) {
+ if (null != dest) {
+ dest.write(buf, 0, read);
+ }
+ }
+ if (null != dest) {
+ dest.flush();
+ }
+ }
+
+ public FileFilter getFileFilterFromFileTypes(String fileTypes) {
+ String glob;
+ if (fileTypes.equals("*")) {
+ glob = ".*";
+ } else {
+ glob = "^.*\\.(" + fileTypes.replace(",", "|") + ")$";
+ }
+ return new PostTool.GlobFileFilter(glob, true);
+ }
+
+ //
+ // Utility methods for XPath handling
+ //
+
+ /** Gets all nodes matching an XPath */
+ public static NodeList getNodesFromXP(Node n, String xpath) throws XPathExpressionException {
+ XPathFactory factory = XPathFactory.newInstance();
+ XPath xp = factory.newXPath();
+ XPathExpression expr = xp.compile(xpath);
+ return (NodeList) expr.evaluate(n, XPathConstants.NODESET);
+ }
+
+ /**
+ * Gets the string content of the node(s) matching an XPath
+ *
+ * @param n the node (or doc)
+ * @param xpath the xpath string
+ * @param concatAll if true, text from all matching nodes will be concatenated, else only the
+ * first returned
+ */
+ public static String getXP(Node n, String xpath, boolean concatAll)
+ throws XPathExpressionException {
+ NodeList nodes = getNodesFromXP(n, xpath);
+ StringBuilder sb = new StringBuilder();
+ if (nodes.getLength() > 0) {
+ for (int i = 0; i < nodes.getLength(); i++) {
+ sb.append(nodes.item(i).getNodeValue()).append(' ');
+ if (!concatAll) {
+ break;
+ }
+ }
+ return sb.toString().trim();
+ } else return "";
+ }
+
+ /** Takes a byte array as input and returns a DOM */
+ public static Document makeDom(byte[] in)
+ throws SAXException, IOException, ParserConfigurationException {
+ InputStream is = new ByteArrayInputStream(in);
+ Document dom = DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(is);
+ return dom;
+ }
+
+ /** Inner class to filter files based on glob wildcards */
+ static class GlobFileFilter implements FileFilter {
+ private final Pattern p;
+
+ public GlobFileFilter(String pattern, boolean isRegex) {
+ String _pattern = pattern;
+ if (!isRegex) {
+ _pattern =
+ _pattern
+ .replace("^", "\\^")
+ .replace("$", "\\$")
+ .replace(".", "\\.")
+ .replace("(", "\\(")
+ .replace(")", "\\)")
+ .replace("+", "\\+")
+ .replace("*", ".*")
+ .replace("?", ".");
+ _pattern = "^" + _pattern + "$";
+ }
+
+ try {
+ p = Pattern.compile(_pattern, Pattern.CASE_INSENSITIVE);
+ } catch (PatternSyntaxException e) {
+ throw new IllegalArgumentException(
+ "Invalid type list " + pattern + ". " + e.getDescription());
+ }
+ }
+
+ @Override
+ public boolean accept(File file) {
+ return p.matcher(file.getName()).find();
+ }
+ }
+
+ //
+ // Simple crawler class which can fetch a page and check for robots.txt
+ //
+ class PageFetcher {
+ Map<String, List<String>> robotsCache;
+ static final String DISALLOW = "Disallow:";
+
+ public PageFetcher() {
+ robotsCache = new HashMap<>();
+ }
+
+ public PageFetcherResult readPageFromUrl(URL u) throws URISyntaxException {
+ PostTool.PageFetcherResult res = new PostTool.PageFetcherResult();
+ try {
+ if (isDisallowedByRobots(u)) {
+ warn("The URL " + u + " is disallowed by robots.txt and will not be crawled.");
+ res.httpStatus = 403;
+ URI uri = u.toURI();
+ visited.add(uri);
+ return res;
+ }
+ res.httpStatus = 404;
+ HttpURLConnection conn = (HttpURLConnection) u.openConnection();
+ conn.setRequestProperty(
+ "User-Agent",
+ "PostTool-crawler/" + SolrVersion.LATEST_STRING + " (https://solr.apache.org/)");
+ conn.setRequestProperty("Accept-Encoding", "gzip, deflate");
+ conn.connect();
+ res.httpStatus = conn.getResponseCode();
+ if (!normalizeUrlEnding(conn.getURL().toString())
+ .equals(normalizeUrlEnding(u.toString()))) {
+ info("The URL " + u + " caused a redirect to " + conn.getURL());
+ u = conn.getURL();
+ res.redirectUrl = u;
+ URI uri = u.toURI();
+ visited.add(uri);
+ }
+ if (res.httpStatus == 200) {
+ // Raw content type of the form "text/html; charset=utf-8"
+ String rawContentType = conn.getContentType();
+ String type = rawContentType.split(";")[0];
+ if (typeSupported(type) || "*".equals(fileTypes)) {
+ String encoding = conn.getContentEncoding();
+ InputStream is;
+ if (encoding != null && encoding.equalsIgnoreCase("gzip")) {
+ is = new GZIPInputStream(conn.getInputStream());
+ } else if (encoding != null && encoding.equalsIgnoreCase("deflate")) {
+ is = new InflaterInputStream(conn.getInputStream(), new Inflater(true));
+ } else {
+ is = conn.getInputStream();
+ }
+
+ // Read into memory, so that we later can pull links from the page without re-fetching
+ res.content = Utils.toByteArray(is);
+ is.close();
+ } else {
+ warn("Skipping URL with unsupported type " + type);
+ res.httpStatus = 415;
+ }
+ }
+ } catch (IOException e) {
+ warn("IOException when reading page from url " + u + ": " + e.getMessage());
+ }
+ return res;
+ }
+
+ public boolean isDisallowedByRobots(URL url) {
+ String host = url.getHost();
+ String strRobot = url.getProtocol() + "://" + host + "/robots.txt";
+ List<String> disallows = robotsCache.get(host);
+ if (disallows == null) {
+ disallows = new ArrayList<>();
+ URL urlRobot;
+ try {
+ urlRobot = new URL(strRobot);
+ disallows = parseRobotsTxt(urlRobot.openStream());
+ } catch (MalformedURLException e) {
+ return true; // We cannot trust this robots URL, should not happen
+ } catch (IOException e) {
+ // There is no robots.txt, will cache an empty disallow list
+ }
+ }
+
+ robotsCache.put(host, disallows);
+
+ String strURL = url.getFile();
+ for (String path : disallows) {
+ if (path.equals("/") || strURL.indexOf(path) == 0) return true;
+ }
+ return false;
+ }
+
+ /**
+ * Very simple robots.txt parser which obeys all Disallow lines regardless of user agent or
+ * whether there are valid Allow: lines.
+ *
+ * @param is Input stream of the robots.txt file
+ * @return a list of disallow paths
+ * @throws IOException if problems reading the stream
+ */
+ protected List<String> parseRobotsTxt(InputStream is) throws IOException {
+ List<String> disallows = new ArrayList<>();
+ BufferedReader r = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8));
+ String l;
+ while ((l = r.readLine()) != null) {
+ String[] arr = l.split("#");
+ if (arr.length == 0) continue;
+ l = arr[0].trim();
+ if (l.startsWith(DISALLOW)) {
+ l = l.substring(DISALLOW.length()).trim();
+ if (l.length() == 0) continue;
+ disallows.add(l);
+ }
+ }
+ is.close();
+ return disallows;
+ }
+
+ /**
+ * Finds links on a web page, using /extract?extractOnly=true
+ *
+ * @param url the URL of the web page
+ * @param is the input stream of the page
+ * @param type the content-type
+ * @param postUrl the posting URL (typically ending in /extract), used to pull out links
+ * @return a set of URIs parsed from the page
+ */
+ protected Set<URI> getLinksFromWebPage(URL url, InputStream is, String type, URL postUrl) {
+ Set<URI> linksFromPage = new HashSet<>();
+
+ try {
+ ByteArrayOutputStream os = new ByteArrayOutputStream();
+ URL extractUrl = new URL(appendParam(postUrl.toString(), "extractOnly=true"));
+ extractUrl = new URL(appendParam(extractUrl.toString(), "wt=xml"));
+ boolean success = postData(is, null, os, type, extractUrl);
+ if (success) {
+ Document d = makeDom(os.toByteArray());
+ String innerXml = getXP(d, "/response/str/text()[1]", false);
+ d = makeDom(innerXml.getBytes(StandardCharsets.UTF_8));
+ NodeList links = getNodesFromXP(d, "/html/body//a/@href");
+ for (int i = 0; i < links.getLength(); i++) {
+ String link = links.item(i).getTextContent();
+ link = computeFullUrl(url, link);
+ if (link == null) {
+ continue;
+ }
+ URI newUri = new URI(link);
+ if (newUri.getAuthority() == null
+ || !newUri.getAuthority().equals(url.getAuthority())) {
+ linksFromPage.add(newUri);
+ }
+ }
+ }
+ } catch (MalformedURLException e) {
+ warn("Malformed URL " + url);
+ } catch (IOException e) {
+ warn("IOException opening URL " + url + ": " + e.getMessage());
+ } catch (Exception e) {
+ throw new RuntimeException(e);
+ }
+
+ return linksFromPage;
+ }
+ }
- spt.execute();
+ /** Utility class to hold the result from a page fetch */
+ public static class PageFetcherResult {
+ int httpStatus = 200;
+ String contentType = "text/html";
+ URL redirectUrl = null;
+ ByteBuffer content;
}
}
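For the non-default modes listed under the -mode option above, here is a rough sketch of "args"
mode, configured by assigning the package-private fields directly the way the unit tests below
do (so it must live in the org.apache.solr.cli package; the URL is hypothetical and the
enclosing method is assumed to declare throws Exception):

    PostTool tool = new PostTool();
    tool.mode = PostTool.DATA_MODE_ARGS;
    tool.solrUpdateUrl = new URL("http://localhost:8983/solr/films/update");
    tool.type = "application/xml"; // the CLI path would also set auto=false when -type is given
    tool.args = new String[] {"<delete><query>*:*</query></delete>"};
    tool.commit = true; // execute() only commits when this flag is set
    tool.execute();     // dispatches to doArgsMode(), then commit()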
diff --git a/solr/core/src/java/org/apache/solr/cli/RunExampleTool.java b/solr/core/src/java/org/apache/solr/cli/RunExampleTool.java
index 908990db22d..680a879372e 100644
--- a/solr/core/src/java/org/apache/solr/cli/RunExampleTool.java
+++ b/solr/core/src/java/org/apache/solr/cli/RunExampleTool.java
@@ -315,22 +315,20 @@ public class RunExampleTool extends ToolBase {
String updateUrl = String.format(Locale.ROOT, "%s/%s/update", solrUrl, collectionName);
echo("Indexing tech product example docs from " + exampledocsDir.getAbsolutePath());
- String currentPropVal = System.getProperty("url");
- System.setProperty("url", updateUrl);
- String currentTypeVal = System.getProperty("type");
- // We assume that example docs are always in XML.
- System.setProperty("type", "application/xml");
- SimplePostTool.main(new String[] {exampledocsDir.getAbsolutePath() + "/*.xml"});
- if (currentPropVal != null) {
- System.setProperty("url", currentPropVal); // reset
- } else {
- System.clearProperty("url");
- }
- if (currentTypeVal != null) {
- System.setProperty("type", currentTypeVal); // reset
- } else {
- System.clearProperty("type");
- }
+ String[] args =
+ new String[] {
+ "post",
+ "-url",
+ updateUrl,
+ "-type",
+ "application/xml",
+ exampledocsDir.getAbsolutePath() + "/*.xml"
+ };
+ PostTool postTool = new PostTool();
+ CommandLine postToolCli =
+ SolrCLI.parseCmdLine(postTool.getName(), args, postTool.getOptions());
+ postTool.runTool(postToolCli);
+
} else {
echo(
"exampledocs directory not found, skipping indexing step for the techproducts example");
diff --git a/solr/core/src/test/org/apache/solr/cli/PostToolTest.java b/solr/core/src/test/org/apache/solr/cli/PostToolTest.java
index 88639c7cefc..e11c11884f1 100644
--- a/solr/core/src/test/org/apache/solr/cli/PostToolTest.java
+++ b/solr/core/src/test/org/apache/solr/cli/PostToolTest.java
@@ -20,10 +20,22 @@ package org.apache.solr.cli;
import static org.apache.solr.cli.SolrCLI.findTool;
import static org.apache.solr.cli.SolrCLI.parseCmdLine;
+import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileWriter;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.MalformedURLException;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.net.URL;
+import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
import java.util.Map;
+import java.util.Set;
import org.apache.commons.cli.CommandLine;
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.client.solrj.request.CollectionAdminRequest;
@@ -33,6 +45,11 @@ import org.apache.solr.common.util.Utils;
import org.junit.BeforeClass;
import org.junit.Test;
+/**
+ * NOTE: do *not* use real hostnames, not even "example.com", in the webcrawler tests.
+ *
+ * <p>A MockPageFetcher is used to prevent real HTTP requests from being executed.
+ */
@SolrTestCaseJ4.SuppressSSL
public class PostToolTest extends SolrCloudTestCase {
@@ -58,7 +75,7 @@ public class PostToolTest extends SolrCloudTestCase {
String[] args = {
"post",
- "-url",
+ "--solr-update-url",
cluster.getJettySolrRunner(0).getBaseUrl() + "/" + collection + "/update",
jsonDoc.getAbsolutePath()
};
@@ -90,4 +107,205 @@ public class PostToolTest extends SolrCloudTestCase {
CommandLine cli = parseCmdLine(tool.getName(), args, tool.getOptions());
return tool.runTool(cli);
}
+
+ @Test
+ public void testNormalizeUrlEnding() {
+ assertEquals("http://[ff01::114]", PostTool.normalizeUrlEnding("http://[ff01::114]/"));
+ assertEquals(
+ "http://[ff01::114]", PostTool.normalizeUrlEnding("http://[ff01::114]/#foo?bar=baz"));
+ assertEquals(
+ "http://[ff01::114]/index.html",
+ PostTool.normalizeUrlEnding("http://[ff01::114]/index.html#hello"));
+ }
+
+ @Test
+ public void testComputeFullUrl() throws IOException {
+
+ PostTool webPostTool = new PostTool();
+
+ assertEquals(
+ "http://[ff01::114]/index.html",
+ webPostTool.computeFullUrl(new URL("http://[ff01::114]/"), "/index.html"));
+ assertEquals(
+ "http://[ff01::114]/index.html",
+ webPostTool.computeFullUrl(new URL("http://[ff01::114]/foo/bar/"), "/index.html"));
+ assertEquals(
+ "http://[ff01::114]/fil.html",
+ webPostTool.computeFullUrl(new URL("http://[ff01::114]/foo.htm?baz#hello"), "fil.html"));
+ // TODO: How to know what is the base if URL path ends with "foo"??
+ // assertEquals("http://[ff01::114]/fil.html", t_web.computeFullUrl(new
+ // URL("http://[ff01::114]/foo?baz#hello"), "fil.html"));
+ assertNull(webPostTool.computeFullUrl(new URL("http://[ff01::114]/"), "fil.jpg"));
+ assertNull(webPostTool.computeFullUrl(new URL("http://[ff01::114]/"), "mailto:hello@foo.bar"));
+ assertNull(webPostTool.computeFullUrl(new URL("http://[ff01::114]/"), "ftp://server/file"));
+ }
+
+ @Test
+ public void testTypeSupported() {
+ PostTool postTool = new PostTool();
+
+ assertTrue(postTool.typeSupported("application/pdf"));
+ assertTrue(postTool.typeSupported("application/xml"));
+ assertFalse(postTool.typeSupported("text/foo"));
+
+ postTool.fileTypes = "doc,xls,ppt";
+ postTool.fileFilter = postTool.getFileFilterFromFileTypes(postTool.fileTypes);
+ assertFalse(postTool.typeSupported("application/pdf"));
+ assertTrue(postTool.typeSupported("application/msword"));
+ }
+
+ @Test
+ public void testAppendParam() {
+ assertEquals(
+ "http://[ff01::114]?foo=bar", PostTool.appendParam("http://[ff01::114]", "foo=bar"));
+ assertEquals(
+ "http://[ff01::114]/?a=b&foo=bar",
+ PostTool.appendParam("http://[ff01::114]/?a=b", "foo=bar"));
+ }
+
+ @Test
+ public void testAppendUrlPath() throws MalformedURLException {
+ assertEquals(
+ new URL("http://[ff01::114]/a?foo=bar"),
+ PostTool.appendUrlPath(new URL("http://[ff01::114]?foo=bar"), "/a"));
+ }
+
+ @Test
+ public void testGuessType() {
+ File f = new File("foo.doc");
+ assertEquals("application/msword", PostTool.guessType(f));
+ f = new File("foobar");
+ assertEquals("application/octet-stream", PostTool.guessType(f));
+ f = new File("foo.json");
+ assertEquals("application/json", PostTool.guessType(f));
+ }
+
+ @Test
+ public void testDoFilesMode() throws MalformedURLException {
+ PostTool postTool = new PostTool();
+ postTool.recursive = 0;
+ postTool.dryRun = true;
+ postTool.solrUpdateUrl = new URL("http://localhost:8983/solr/fake/update");
+ File dir = getFile("exampledocs");
+ int num = postTool.postFiles(new String[] {dir.toString()}, 0, null, null);
+ assertEquals(2, num);
+ }
+
+ @Test
+ public void testDoWebMode() throws IOException, URISyntaxException {
+ PostTool postTool = new PostTool();
+ postTool.pageFetcher = new MockPageFetcher();
+ postTool.dryRun = true;
+ postTool.solrUpdateUrl = new URL("http://user:password@localhost:5150/solr/fake/update");
+
+ // Uses mock pageFetcher
+ postTool.delay = 0;
+ postTool.recursive = 5;
+ int num = postTool.postWebPages(new String[] {"http://[ff01::114]/#removeme"}, 0, null);
+ assertEquals(5, num);
+
+ postTool.recursive = 1;
+ num = postTool.postWebPages(new String[] {"http://[ff01::114]/"}, 0, null);
+ assertEquals(3, num);
+
+ // Without respecting robots.txt
+ postTool.pageFetcher.robotsCache.put("[ff01::114]", Collections.emptyList());
+ postTool.recursive = 5;
+ num = postTool.postWebPages(new String[] {"http://[ff01::114]/#removeme"}, 0, null);
+ assertEquals(6, num);
+ }
+
+ @Test
+ public void testRobotsExclusion() throws IOException, URISyntaxException {
+ PostTool postTool = new PostTool();
+ postTool.pageFetcher = new MockPageFetcher();
+ postTool.dryRun = true;
+
+ assertFalse(postTool.pageFetcher.isDisallowedByRobots(new URL("http://[ff01::114]/")));
+ assertTrue(postTool.pageFetcher.isDisallowedByRobots(new URL("http://[ff01::114]/disallowed")));
+ assertEquals(
+ "There should be two entries parsed from robots.txt",
+ 2,
+ postTool.pageFetcher.robotsCache.get("[ff01::114]").size());
+ }
+
+ static class MockPageFetcher extends PostTool.PageFetcher {
+ HashMap<String, String> htmlMap = new HashMap<>();
+ HashMap<String, Set<URI>> linkMap = new HashMap<>();
+
+ public MockPageFetcher() throws IOException, URISyntaxException {
+ (new PostTool()).super();
+ htmlMap.put(
+ "http://[ff01::114]",
+ "<html><body><a href=\"http://[ff01::114]/page1\">page1</a><a href=\"http://[ff01::114]/page2\">page2</a></body></html>");
+ htmlMap.put(
+ "http://[ff01::114]/index.html",
+ "<html><body><a href=\"http://[ff01::114]/page1\">page1</a><a href=\"http://[ff01::114]/page2\">page2</a></body></html>");
+ htmlMap.put(
+ "http://[ff01::114]/page1",
+ "<html><body><a href=\"http://[ff01::114]/page1/foo\"></body></html>");
+ htmlMap.put(
+ "http://[ff01::114]/page1/foo",
+ "<html><body><a href=\"http://[ff01::114]/page1/foo/bar\"></body></html>");
+ htmlMap.put(
+ "http://[ff01::114]/page1/foo/bar",
+ "<html><body><a href=\"http://[ff01::114]/page1\"></body></html>");
+ htmlMap.put(
+ "http://[ff01::114]/page2",
+ "<html><body><a href=\"http://[ff01::114]/\"><a href=\"http://[ff01::114]/disallowed\"/></body></html>");
+ htmlMap.put(
+ "http://[ff01::114]/disallowed",
+ "<html><body><a href=\"http://[ff01::114]/\"></body></html>");
+
+ Set<URI> s = new HashSet<>();
+ s.add(new URI("http://[ff01::114]/page1"));
+ s.add(new URI("http://[ff01::114]/page2"));
+ linkMap.put("http://[ff01::114]", s);
+ linkMap.put("http://[ff01::114]/index.html", s);
+ s = new HashSet<>();
+ s.add(new URI("http://[ff01::114]/page1/foo"));
+ linkMap.put("http://[ff01::114]/page1", s);
+ s = new HashSet<>();
+ s.add(new URI("http://[ff01::114]/page1/foo/bar"));
+ linkMap.put("http://[ff01::114]/page1/foo", s);
+ s = new HashSet<>();
+ s.add(new URI("http://[ff01::114]/disallowed"));
+ linkMap.put("http://[ff01::114]/page2", s);
+
+ // Simulate a robots.txt file with comments and a few disallows
+ StringBuilder sb = new StringBuilder();
+ sb.append(
+ "# Comments appear after the \"#\" symbol at the start of a line, or after a directive\n");
+ sb.append("User-agent: * # match all bots\n");
+ sb.append("Disallow: # This is void\n");
+ sb.append("Disallow: /disallow # Disallow this path\n");
+ sb.append("Disallow: /nonexistentpath # Disallow this path\n");
+ this.robotsCache.put(
+ "[ff01::114]",
+ super.parseRobotsTxt(
+ new ByteArrayInputStream(sb.toString().getBytes(StandardCharsets.UTF_8))));
+ }
+
+ @Override
+ public PostTool.PageFetcherResult readPageFromUrl(URL u) {
+ PostTool.PageFetcherResult res = new PostTool.PageFetcherResult();
+ if (isDisallowedByRobots(u)) {
+ res.httpStatus = 403;
+ return res;
+ }
+ res.httpStatus = 200;
+ res.contentType = "text/html";
+ res.content = ByteBuffer.wrap(htmlMap.get(u.toString()).getBytes(StandardCharsets.UTF_8));
+ return res;
+ }
+
+ @Override
+ public Set<URI> getLinksFromWebPage(URL url, InputStream is, String type, URL postUrl) {
+ Set<URI> s = linkMap.get(PostTool.normalizeUrlEnding(url.toString()));
+ if (s == null) {
+ s = new HashSet<>();
+ }
+ return s;
+ }
+ }
}
diff --git a/solr/core/src/test/org/apache/solr/cloud/SolrCloudExampleTest.java b/solr/core/src/test/org/apache/solr/cloud/SolrCloudExampleTest.java
index b6c63148a4e..a40903c1f0a 100644
--- a/solr/core/src/test/org/apache/solr/cloud/SolrCloudExampleTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/SolrCloudExampleTest.java
@@ -20,22 +20,15 @@ import java.io.File;
import java.lang.invoke.MethodHandles;
import java.nio.file.Files;
import java.nio.file.Path;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.List;
-import java.util.Random;
import java.util.Set;
import java.util.concurrent.TimeUnit;
-import java.util.stream.Collectors;
-import java.util.stream.Stream;
import org.apache.commons.cli.CommandLine;
import org.apache.solr.cli.CreateCollectionTool;
import org.apache.solr.cli.DeleteTool;
import org.apache.solr.cli.HealthcheckTool;
+import org.apache.solr.cli.PostTool;
import org.apache.solr.cli.SolrCLI;
import org.apache.solr.client.solrj.SolrQuery;
-import org.apache.solr.client.solrj.request.StreamingUpdateRequest;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.cloud.ZkStateReader;
import org.apache.solr.util.ExternalPaths;
@@ -115,52 +108,36 @@ public class SolrCloudExampleTest extends AbstractFullDistribZkTestBase {
invalidToolExitStatus,
tool.runTool(cli));
- // now index docs like bin/solr post would, but we can't use SimplePostTool because it uses
- // System.exit when it encounters an error, which JUnit doesn't like ...
+ // now index docs ...
log.info("Created collection, now posting example docs!");
Path exampleDocsDir = Path.of(ExternalPaths.SOURCE_HOME, "example", "exampledocs");
assertTrue(exampleDocsDir.toAbsolutePath() + " not found!", Files.isDirectory(exampleDocsDir));
- List<Path> xmlFiles;
- try (Stream<Path> stream = Files.walk(exampleDocsDir, 1)) {
- xmlFiles =
- stream
- .filter(path -> path.getFileName().toString().endsWith(".xml"))
- // don't rely on File.compareTo, it's behavior varies by OS
- .sorted(Comparator.comparing(path -> path.getFileName().toString()))
- // be explicit about the collection type because we will shuffle it later
- .collect(Collectors.toCollection(ArrayList::new));
- }
+ String[] argsForPost =
+ new String[] {
+ "--solr-update-url",
+ solrUrl + "/" + testCollectionName + "/update",
+ "-filetypes",
+ "xml",
+ exampleDocsDir.toAbsolutePath().toString()
+ };
- // force a deterministic random ordering of the files so seeds reproduce regardless of
- // platform/filesystem
- Collections.shuffle(xmlFiles, new Random(random().nextLong()));
+ PostTool postTool = new PostTool();
+ CommandLine postCli =
+ SolrCLI.processCommandLineArgs(postTool.getName(), postTool.getOptions(), argsForPost);
+ postTool.runTool(postCli);
- // if you add/remove example XML docs, you'll have to fix these expected values
- int expectedXmlFileCount = 14;
int expectedXmlDocCount = 32;
- assertEquals(
- "Unexpected # of example XML files in " + exampleDocsDir.toAbsolutePath(),
- expectedXmlFileCount,
- xmlFiles.size());
-
- for (Path xml : xmlFiles) {
- if (log.isInfoEnabled()) {
- log.info("POSTing {}", xml.toAbsolutePath());
- }
- cloudClient.request(
- new StreamingUpdateRequest("/update", xml, "application/xml"), testCollectionName);
- }
- cloudClient.commit(testCollectionName);
-
int numFound = 0;
// give the update a chance to take effect.
for (int idx = 0; idx < 100; ++idx) {
QueryResponse qr = cloudClient.query(testCollectionName, new SolrQuery("*:*"));
numFound = (int) qr.getResults().getNumFound();
- if (numFound == expectedXmlDocCount) break;
+ if (numFound == expectedXmlDocCount) {
+ break;
+ }
Thread.sleep(100);
}
assertEquals("*:* found unexpected number of documents", expectedXmlDocCount, numFound);
diff --git a/solr/packaging/test/test_post.bats b/solr/packaging/test/test_post.bats
index 34f39cfad87..1dcb561afa8 100644
--- a/solr/packaging/test/test_post.bats
+++ b/solr/packaging/test/test_post.bats
@@ -78,7 +78,7 @@ teardown() {
solr create_collection -c monitors_no_type -d _default
- run solr post -url http://localhost:${SOLR_PORT}/solr/monitors_no_type/update -commit ${SOLR_TIP}/example/exampledocs/monitor.xml
+ run solr post -url http://localhost:${SOLR_PORT}/solr/monitors_no_type/update ${SOLR_TIP}/example/exampledocs/monitor.xml
assert_output --partial '1 files indexed.'
refute_output --partial 'ERROR'
@@ -87,7 +87,7 @@ teardown() {
solr create_collection -c books_no_type -d _default
- run solr post -url http://localhost:${SOLR_PORT}/solr/books_no_type/update -commit ${SOLR_TIP}/example/exampledocs/books.json
+ run solr post -url http://localhost:${SOLR_PORT}/solr/books_no_type/update ${SOLR_TIP}/example/exampledocs/books.json
assert_output --partial '1 files indexed.'
refute_output --partial 'ERROR'
@@ -96,7 +96,7 @@ teardown() {
solr create_collection -c books_csv_no_type -d _default
- run solr post -url http://localhost:${SOLR_PORT}/solr/books_csv_no_type/update -commit ${SOLR_TIP}/example/exampledocs/books.csv
+ run solr post -url http://localhost:${SOLR_PORT}/solr/books_csv_no_type/update ${SOLR_TIP}/example/exampledocs/books.csv
assert_output --partial '1 files indexed.'
refute_output --partial 'ERROR'
@@ -104,12 +104,22 @@ teardown() {
assert_output --partial '"numFound":10'
}
+@test "crawling a directory as a dry-run" {
+
+ # We filter to xml, json, and csv as we don't want to invoke the Extract handler, and we run it as a dry run
+ run solr post --dry-run -filetypes xml,json,csv -url http://localhost:${SOLR_PORT}/solr/foobar/update -skipcommit ${SOLR_TIP}/example/exampledocs
+
+ assert_output --partial 'Dry run complete. 16 would have been indexed.'
+ refute_output --partial '16 files indexed.'
+ refute_output --partial 'ERROR'
+}
+
@test "crawling a directory" {
solr create_collection -c mixed_content -d _default
# We filter to xml,json,and csv as we don't want to invoke the Extract handler.
- run solr post -filetypes xml,json,csv -url http://localhost:${SOLR_PORT}/solr/mixed_content/update -commit ${SOLR_TIP}/example/exampledocs
+ run solr post -filetypes xml,json,csv -url http://localhost:${SOLR_PORT}/solr/mixed_content/update ${SOLR_TIP}/example/exampledocs
assert_output --partial '16 files indexed.'
refute_output --partial 'ERROR'
@@ -129,7 +139,7 @@ teardown() {
}
}' "http://localhost:${SOLR_PORT}/solr/webcrawl/config"
- run solr post -mode web -url http://localhost:${SOLR_PORT}/webcrawl/update -recursive 1 -delay 1 https://solr.apache.org
+ run solr post -mode web --solr-update-url http://localhost:${SOLR_PORT}/webcrawl/update -recursive 1 -delay 1 https://solr.apache.org
assert_output --partial 'Entering crawl at level 0'
}
@@ -152,7 +162,7 @@ teardown() {
run solr create_collection -c test_args -d _default
assert_output --partial "Created collection 'test_args'"
- run solr post -url http://localhost:${SOLR_PORT}/solr/test_args/update -mode args -type application/xml -out -commit "<delete><query>*:*</query></delete>"
+ run solr post -url http://localhost:${SOLR_PORT}/solr/test_args/update -mode args -type application/xml -out "<delete><query>*:*</query></delete>"
assert_output --partial '<int name="status">0</int>'
# confirm default type