You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by us...@apache.org on 2012/08/19 11:35:31 UTC
svn commit: r1374718 [3/3] - in /lucene/dev/branches/lucene3312: ./
dev-tools/ dev-tools/eclipse/ dev-tools/idea/solr/contrib/extraction/
lucene/ lucene/analysis/ lucene/analysis/common/
lucene/analysis/common/src/java/org/apache/lucene/analysis/charfi...
Modified: lucene/dev/branches/lucene3312/solr/core/src/java/org/apache/solr/handler/SnapPuller.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3312/solr/core/src/java/org/apache/solr/handler/SnapPuller.java?rev=1374718&r1=1374717&r2=1374718&view=diff
==============================================================================
--- lucene/dev/branches/lucene3312/solr/core/src/java/org/apache/solr/handler/SnapPuller.java (original)
+++ lucene/dev/branches/lucene3312/solr/core/src/java/org/apache/solr/handler/SnapPuller.java Sun Aug 19 09:35:25 2012
@@ -384,7 +384,7 @@ public class SnapPuller {
// may be closed
core.getDirectoryFactory().doneWithDirectory(oldDirectory);
}
- doCommit();
+ doCommit(isFullCopyNeeded);
}
replicationStartTime = 0;
@@ -533,11 +533,11 @@ public class SnapPuller {
return sb;
}
- private void doCommit() throws IOException {
+ private void doCommit(boolean isFullCopyNeeded) throws IOException {
SolrQueryRequest req = new LocalSolrQueryRequest(solrCore,
new ModifiableSolrParams());
// reboot the writer on the new index and get a new searcher
- solrCore.getUpdateHandler().newIndexWriter(true);
+ solrCore.getUpdateHandler().newIndexWriter(isFullCopyNeeded);
try {
// first try to open an NRT searcher so that the new
Modified: lucene/dev/branches/lucene3312/solr/core/src/java/org/apache/solr/handler/component/StatsValuesFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3312/solr/core/src/java/org/apache/solr/handler/component/StatsValuesFactory.java?rev=1374718&r1=1374717&r2=1374718&view=diff
==============================================================================
--- lucene/dev/branches/lucene3312/solr/core/src/java/org/apache/solr/handler/component/StatsValuesFactory.java (original)
+++ lucene/dev/branches/lucene3312/solr/core/src/java/org/apache/solr/handler/component/StatsValuesFactory.java Sun Aug 19 09:35:25 2012
@@ -182,8 +182,8 @@ abstract class AbstractStatsValues<T> im
for (Map.Entry<String, StatsValues> e2 : entry.getValue().entrySet()) {
nl2.add(e2.getKey(), e2.getValue().getStatsValues());
}
- res.add(FACETS, nl);
}
+ res.add(FACETS, nl);
return res;
}
Modified: lucene/dev/branches/lucene3312/solr/core/src/java/org/apache/solr/update/DefaultSolrCoreState.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3312/solr/core/src/java/org/apache/solr/update/DefaultSolrCoreState.java?rev=1374718&r1=1374717&r2=1374718&view=diff
==============================================================================
--- lucene/dev/branches/lucene3312/solr/core/src/java/org/apache/solr/update/DefaultSolrCoreState.java (original)
+++ lucene/dev/branches/lucene3312/solr/core/src/java/org/apache/solr/update/DefaultSolrCoreState.java Sun Aug 19 09:35:25 2012
@@ -74,8 +74,7 @@ public final class DefaultSolrCoreState
}
if (indexWriter == null) {
- indexWriter = createMainIndexWriter(core, "DirectUpdateHandler2",
- false, false);
+ indexWriter = createMainIndexWriter(core, "DirectUpdateHandler2", false);
}
if (refCntWriter == null) {
refCntWriter = new RefCounted<IndexWriter>(indexWriter) {
@@ -110,18 +109,28 @@ public final class DefaultSolrCoreState
writerPauseLock.wait();
} catch (InterruptedException e) {}
}
-
+
try {
if (indexWriter != null) {
- try {
- log.info("Closing old IndexWriter... core=" + coreName);
- indexWriter.close();
- } catch (Throwable t) {
- SolrException.log(log, "Error closing old IndexWriter. core=" + coreName, t);
+ if (!rollback) {
+ try {
+ log.info("Closing old IndexWriter... core=" + coreName);
+ indexWriter.close();
+ } catch (Throwable t) {
+ SolrException.log(log, "Error closing old IndexWriter. core="
+ + coreName, t);
+ }
+ } else {
+ try {
+ log.info("Rollback old IndexWriter... core=" + coreName);
+ indexWriter.rollback();
+ } catch (Throwable t) {
+ SolrException.log(log, "Error rolling back old IndexWriter. core="
+ + coreName, t);
+ }
}
}
- indexWriter = createMainIndexWriter(core, "DirectUpdateHandler2",
- false, true);
+ indexWriter = createMainIndexWriter(core, "DirectUpdateHandler2", true);
log.info("New IndexWriter is ready to be used.");
// we need to null this so it picks up the new writer next get call
refCntWriter = null;
@@ -174,14 +183,12 @@ public final class DefaultSolrCoreState
@Override
public synchronized void rollbackIndexWriter(SolrCore core) throws IOException {
- indexWriter.rollback();
newIndexWriter(core, true);
}
- protected SolrIndexWriter createMainIndexWriter(SolrCore core, String name,
- boolean removeAllExisting, boolean forceNewDirectory) throws IOException {
+ protected SolrIndexWriter createMainIndexWriter(SolrCore core, String name, boolean forceNewDirectory) throws IOException {
return new SolrIndexWriter(name, core.getNewIndexDir(),
- core.getDirectoryFactory(), removeAllExisting, core.getSchema(),
+ core.getDirectoryFactory(), false, core.getSchema(),
core.getSolrConfig().indexConfig, core.getDeletionPolicy(), core.getCodec(), forceNewDirectory);
}
Modified: lucene/dev/branches/lucene3312/solr/core/src/java/org/apache/solr/update/SolrIndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3312/solr/core/src/java/org/apache/solr/update/SolrIndexWriter.java?rev=1374718&r1=1374717&r2=1374718&view=diff
==============================================================================
--- lucene/dev/branches/lucene3312/solr/core/src/java/org/apache/solr/update/SolrIndexWriter.java (original)
+++ lucene/dev/branches/lucene3312/solr/core/src/java/org/apache/solr/update/SolrIndexWriter.java Sun Aug 19 09:35:25 2012
@@ -141,6 +141,8 @@ public class SolrIndexWriter extends Ind
super.rollback();
} finally {
isClosed = true;
+ directoryFactory.release(getDirectory());
+ numCloses.incrementAndGet();
}
}
Modified: lucene/dev/branches/lucene3312/solr/core/src/java/org/apache/solr/update/TransactionLog.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3312/solr/core/src/java/org/apache/solr/update/TransactionLog.java?rev=1374718&r1=1374717&r2=1374718&view=diff
==============================================================================
--- lucene/dev/branches/lucene3312/solr/core/src/java/org/apache/solr/update/TransactionLog.java (original)
+++ lucene/dev/branches/lucene3312/solr/core/src/java/org/apache/solr/update/TransactionLog.java Sun Aug 19 09:35:25 2012
@@ -775,31 +775,3 @@ class ChannelFastInputStream extends Fas
}
-class MemOutputStream extends FastOutputStream {
- public List<byte[]> buffers = new LinkedList<byte[]>();
- public MemOutputStream(byte[] tempBuffer) {
- super(null, tempBuffer, 0);
- }
-
- @Override
- public void flush(byte[] arr, int offset, int len) throws IOException {
- if (arr == buf && offset==0 && len==buf.length) {
- buffers.add(buf); // steal the buffer
- buf = new byte[8192];
- } else if (len > 0) {
- byte[] newBuf = new byte[len];
- System.arraycopy(arr, offset, newBuf, 0, len);
- buffers.add(newBuf);
- }
- }
-
- public void writeAll(FastOutputStream fos) throws IOException {
- for (byte[] buffer : buffers) {
- fos.write(buffer);
- }
- if (pos > 0) {
- fos.write(buf, 0, pos);
- }
- }
-}
-
Modified: lucene/dev/branches/lucene3312/solr/core/src/java/org/apache/solr/util/FastWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3312/solr/core/src/java/org/apache/solr/util/FastWriter.java?rev=1374718&r1=1374717&r2=1374718&view=diff
==============================================================================
--- lucene/dev/branches/lucene3312/solr/core/src/java/org/apache/solr/util/FastWriter.java (original)
+++ lucene/dev/branches/lucene3312/solr/core/src/java/org/apache/solr/util/FastWriter.java Sun Aug 19 09:35:25 2012
@@ -28,7 +28,7 @@ public class FastWriter extends Writer {
// it won't cause double buffering.
private static final int BUFSIZE = 8192;
protected final Writer sink;
- protected final char[] buf;
+ protected char[] buf;
protected int pos;
public FastWriter(Writer w) {
@@ -69,42 +69,64 @@ public class FastWriter extends Writer {
}
@Override
- public void write(char cbuf[], int off, int len) throws IOException {
- int space = buf.length - pos;
- if (len < space) {
- System.arraycopy(cbuf, off, buf, pos, len);
- pos += len;
- } else if (len<BUFSIZE) {
- // if the data to write is small enough, buffer it.
- System.arraycopy(cbuf, off, buf, pos, space);
+ public void write(char arr[], int off, int len) throws IOException {
+ for(;;) {
+ int space = buf.length - pos;
+
+ if (len <= space) {
+ System.arraycopy(arr, off, buf, pos, len);
+ pos += len;
+ return;
+ } else if (len > buf.length) {
+ if (pos>0) {
+ flush(buf,0,pos); // flush
+ pos=0;
+ }
+ // don't buffer, just write to sink
+ flush(arr, off, len);
+ return;
+ }
+
+ // buffer is too big to fit in the free space, but
+ // not big enough to warrant writing on its own.
+ // write whatever we can fit, then flush and iterate.
+
+ System.arraycopy(arr, off, buf, pos, space);
flush(buf, 0, buf.length);
- pos = len-space;
- System.arraycopy(cbuf, off+space, buf, 0, pos);
- } else {
- flush(buf,0,pos); // flush
- pos=0;
- // don't buffer, just write to sink
- flush(cbuf, off, len);
+ pos = 0;
+ off += space;
+ len -= space;
}
}
@Override
public void write(String str, int off, int len) throws IOException {
- int space = buf.length - pos;
- if (len < space) {
- str.getChars(off, off+len, buf, pos);
- pos += len;
- } else if (len<BUFSIZE) {
- // if the data to write is small enough, buffer it.
+ for(;;) {
+ int space = buf.length - pos;
+
+ if (len <= space) {
+ str.getChars(off, off+len, buf, pos);
+ pos += len;
+ return;
+ } else if (len > buf.length) {
+ if (pos>0) {
+ flush(buf,0,pos); // flush
+ pos=0;
+ }
+ // don't buffer, just write to sink
+ flush(str, off, len);
+ return;
+ }
+
+ // buffer is too big to fit in the free space, but
+ // not big enough to warrant writing on its own.
+ // write whatever we can fit, then flush and iterate.
+
str.getChars(off, off+space, buf, pos);
flush(buf, 0, buf.length);
- str.getChars(off+space, off+len, buf, 0);
- pos = len-space;
- } else {
- flush(buf,0,pos); // flush
- pos=0;
- // don't buffer, just write to sink
- flush(str, off, len);
+ pos = 0;
+ off += space;
+ len -= space;
}
}
Modified: lucene/dev/branches/lucene3312/solr/core/src/java/org/apache/solr/util/SimplePostTool.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3312/solr/core/src/java/org/apache/solr/util/SimplePostTool.java?rev=1374718&r1=1374717&r2=1374718&view=diff
==============================================================================
--- lucene/dev/branches/lucene3312/solr/core/src/java/org/apache/solr/util/SimplePostTool.java (original)
+++ lucene/dev/branches/lucene3312/solr/core/src/java/org/apache/solr/util/SimplePostTool.java Sun Aug 19 09:35:25 2012
@@ -17,65 +17,110 @@ package org.apache.solr.util;
* limitations under the License.
*/
+import java.io.BufferedReader;
+import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileFilter;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.ByteArrayInputStream;
+import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.UnsupportedEncodingException;
-import java.util.Locale;
+import java.util.ArrayList;
import java.util.HashMap;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
import java.util.Set;
import java.util.HashSet;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
+import java.util.zip.GZIPInputStream;
+import java.util.zip.Inflater;
+import java.util.zip.InflaterInputStream;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.ProtocolException;
import java.net.URL;
import java.net.URLEncoder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.xpath.XPath;
+import javax.xml.xpath.XPathConstants;
+import javax.xml.xpath.XPathExpression;
+import javax.xml.xpath.XPathExpressionException;
+import javax.xml.xpath.XPathFactory;
+
+import org.w3c.dom.Document;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+import org.xml.sax.SAXException;
+
/**
* A simple utility class for posting raw updates to a Solr server,
* has a main method so it can be run on the command line.
+ * View this not as a best-practice code example, but as a standalone
+ * example built with an explicit purpose of not having external
+ * jar dependencies.
*/
public class SimplePostTool {
- public static final String DEFAULT_POST_URL = "http://localhost:8983/solr/update";
- public static final String VERSION_OF_THIS_TOOL = "1.5";
+ private static final String DEFAULT_POST_URL = "http://localhost:8983/solr/update";
+ private static final String VERSION_OF_THIS_TOOL = "1.5";
private static final String DEFAULT_COMMIT = "yes";
private static final String DEFAULT_OPTIMIZE = "no";
private static final String DEFAULT_OUT = "no";
private static final String DEFAULT_AUTO = "no";
- private static final String DEFAULT_RECURSIVE = "no";
-
+ private static final String DEFAULT_RECURSIVE = "0";
+ private static final int DEFAULT_WEB_DELAY = 10;
+ private static final int MAX_WEB_DEPTH = 10;
private static final String DEFAULT_CONTENT_TYPE = "application/xml";
private static final String DEFAULT_FILE_TYPES = "xml,json,csv,pdf,doc,docx,ppt,pptx,xls,xlsx,odt,odp,ods,ott,otp,ots,rtf,htm,html,txt,log";
- private static final String DATA_MODE_FILES = "files";
- private static final String DATA_MODE_ARGS = "args";
- private static final String DATA_MODE_STDIN = "stdin";
- private static final String DEFAULT_DATA_MODE = DATA_MODE_FILES;
-
- private static final String TRUE_STRINGS = "true,on,yes,1";
-
- private boolean auto = false;
- private boolean recursive = false;
- private String fileTypes;
-
- private static HashMap<String,String> mimeMap;
- private GlobFileFilter globFileFilter;
-
- private static final Set<String> DATA_MODES = new HashSet<String>();
- private static final String USAGE_STRING_SHORT =
- "Usage: java [SystemProperties] -jar post.jar [-h|-] [<file|folder|arg> [<file|folder|arg>...]]";
+ static final String DATA_MODE_FILES = "files";
+ static final String DATA_MODE_ARGS = "args";
+ static final String DATA_MODE_STDIN = "stdin";
+ static final String DATA_MODE_WEB = "web";
+ static final String DEFAULT_DATA_MODE = DATA_MODE_FILES;
+
+ // Input args
+ boolean auto = false;
+ int recursive = 0;
+ int delay = 0;
+ String fileTypes;
+ URL solrUrl;
+ OutputStream out = null;
+ String type;
+ String mode;
+ boolean commit;
+ boolean optimize;
+ String[] args;
+
+ private int currentDepth;
+
+ static HashMap<String,String> mimeMap;
+ GlobFileFilter globFileFilter;
+ // Backlog for crawling
+ List<LinkedHashSet<URL>> backlog = new ArrayList<LinkedHashSet<URL>>();
+ Set<URL> visited = new HashSet<URL>();
+
+ static final Set<String> DATA_MODES = new HashSet<String>();
+ static final String USAGE_STRING_SHORT =
+ "Usage: java [SystemProperties] -jar post.jar [-h|-] [<file|folder|url|arg> [<file|folder|url|arg>...]]";
+
+ // Used in tests to avoid doing actual network traffic
+ static boolean mockMode = false;
+ static PageFetcher pageFetcher;
static {
DATA_MODES.add(DATA_MODE_FILES);
DATA_MODES.add(DATA_MODE_ARGS);
DATA_MODES.add(DATA_MODE_STDIN);
+ DATA_MODES.add(DATA_MODE_WEB);
mimeMap = new HashMap<String,String>();
mimeMap.put("xml", "text/xml");
@@ -100,97 +145,196 @@ public class SimplePostTool {
mimeMap.put("txt", "text/plain");
mimeMap.put("log", "text/plain");
}
-
- protected URL solrUrl;
+ /**
+ * See usage() for valid command line usage
+ * @param args the params on the command line
+ */
public static void main(String[] args) {
info("SimplePostTool version " + VERSION_OF_THIS_TOOL);
-
if (0 < args.length && ("-help".equals(args[0]) || "--help".equals(args[0]) || "-h".equals(args[0]))) {
usage();
+ } else {
+ final SimplePostTool t = parseArgsAndInit(args);
+ t.execute();
+ }
+ }
+
+ /**
+ * After initialization, call execute to start the post job.
+ * This method delegates to the correct mode method.
+ */
+ public void execute() {
+ if (DATA_MODE_FILES.equals(mode) && args.length > 0) {
+ doFilesMode();
+ } else if(DATA_MODE_ARGS.equals(mode) && args.length > 0) {
+ doArgsMode();
+ } else if(DATA_MODE_WEB.equals(mode) && args.length > 0) {
+ doWebMode();
+ } else if(DATA_MODE_STDIN.equals(mode)) {
+ doStdinMode();
+ } else {
+ usageShort();
return;
}
- OutputStream out = null;
- final String type = System.getProperty("type");
-
- final String params = System.getProperty("params", "");
-
- URL u = null;
+ if (commit) commit();
+ if (optimize) optimize();
+ }
+
+ /**
+ * Parses incoming arguments and system params and initializes the tool
+ * @param args the incoming cmd line args
+ * @return an instance of SimplePostTool
+ */
+ protected static SimplePostTool parseArgsAndInit(String[] args) {
+ String urlStr = null;
try {
- u = new URL(System.getProperty("url", SimplePostTool.appendParam(DEFAULT_POST_URL, params)));
+ // Parse args
+ final String mode = System.getProperty("data", DEFAULT_DATA_MODE);
+ if (! DATA_MODES.contains(mode)) {
+ fatal("System Property 'data' is not valid for this tool: " + mode);
+ }
+ String params = System.getProperty("params", "");
+ urlStr = System.getProperty("url", SimplePostTool.appendParam(DEFAULT_POST_URL, params));
+ URL url = new URL(urlStr);
+ boolean auto = isOn(System.getProperty("auto", DEFAULT_AUTO));
+ String type = System.getProperty("type");
+ // Recursive
+ int recursive = 0;
+ String r = System.getProperty("recursive", DEFAULT_RECURSIVE);
+ try {
+ recursive = Integer.parseInt(r);
+ } catch(Exception e) {
+ if (isOn(r))
+ recursive = DATA_MODE_WEB.equals(mode)?1:999;
+ }
+ // Delay
+ int delay = DATA_MODE_WEB.equals(mode) ? DEFAULT_WEB_DELAY : 0;
+ try {
+ delay = Integer.parseInt(System.getProperty("delay", ""+delay));
+ } catch(Exception e) { }
+ OutputStream out = isOn(System.getProperty("out", DEFAULT_OUT)) ? System.out : null;
+ String fileTypes = System.getProperty("filetypes", DEFAULT_FILE_TYPES);
+ boolean commit = isOn(System.getProperty("commit",DEFAULT_COMMIT));
+ boolean optimize = isOn(System.getProperty("optimize",DEFAULT_OPTIMIZE));
+
+ return new SimplePostTool(mode, url, auto, type, recursive, delay, fileTypes, out, commit, optimize, args);
} catch (MalformedURLException e) {
- fatal("System Property 'url' is not a valid URL: " + u);
+ fatal("System Property 'url' is not a valid URL: " + urlStr);
+ return null;
}
- final SimplePostTool t = new SimplePostTool(u);
+ }
- if (isOn(System.getProperty("auto", DEFAULT_AUTO))) {
- t.setAuto(true);
- }
-
- if (isOn(System.getProperty("recursive", DEFAULT_RECURSIVE))) {
- t.setRecursive(true);
- }
+ /**
+ * Constructor which takes in all mandatory input for the tool to work.
+ * Also see usage() for further explanation of the params.
+ * @param mode whether to post files, web pages, params or stdin
+ * @param url the Solr base Url to post to, should end with /update
+ * @param auto if true, we'll guess type and add resourcename/url
+ * @param type content-type of the data you are posting
+ * @param recursive number of levels for file/web mode, or 0 if one file only
+ * @param delay if recursive then delay will be the wait time between posts
+ * @param fileTypes a comma separated list of file-name endings to accept for file/web
+ * @param out an OutputStream to write output to, e.g. stdout to print to console
+ * @param commit if true, will commit at end of posting
+ * @param optimize if true, will optimize at end of posting
+ * @param args a String[] of arguments, varies between modes
+ */
+ public SimplePostTool(String mode, URL url, boolean auto, String type,
+ int recursive, int delay, String fileTypes, OutputStream out,
+ boolean commit, boolean optimize, String[] args) {
+ this.mode = mode;
+ this.solrUrl = url;
+ this.auto = auto;
+ this.type = type;
+ this.recursive = recursive;
+ this.delay = delay;
+ this.fileTypes = fileTypes;
+ this.globFileFilter = getFileFilterFromFileTypes(fileTypes);
+ this.out = out;
+ this.commit = commit;
+ this.optimize = optimize;
+ this.args = args;
+ pageFetcher = new PageFetcher();
+ }
- final String mode = System.getProperty("data", DEFAULT_DATA_MODE);
- if (! DATA_MODES.contains(mode)) {
- fatal("System Property 'data' is not valid for this tool: " + mode);
+ public SimplePostTool() {}
+
+ //
+ // Do some action depending on which mode we have
+ //
+ private void doFilesMode() {
+ currentDepth = 0;
+ // Skip posting files if special param "-" given
+ if (!args[0].equals("-")) {
+ info("Posting files to base url " + solrUrl + (!auto?" using content-type "+(type==null?DEFAULT_CONTENT_TYPE:type):"")+"..");
+ if(auto)
+ info("Entering auto mode. File endings considered are "+fileTypes);
+ if(recursive > 0)
+ info("Entering recursive mode, max depth="+recursive+", delay="+delay+"s");
+ int numFilesPosted = postFiles(args, 0, out, type);
+ info(numFilesPosted + " files indexed.");
}
+ }
- if (isOn(System.getProperty("out", DEFAULT_OUT))) {
- out = System.out;
+ private void doArgsMode() {
+ info("POSTing args to " + solrUrl + "..");
+ for (String a : args) {
+ postData(stringToStream(a), null, out, type, solrUrl);
}
-
- t.setFileTypes(System.getProperty("filetypes", DEFAULT_FILE_TYPES));
+ }
- int numFilesPosted = 0;
-
+ private int doWebMode() {
+ reset();
+ int numPagesPosted = 0;
try {
- if (DATA_MODE_FILES.equals(mode)) {
- if (0 < args.length) {
- // Skip posting files if special param "-" given
- if (!args[0].equals("-")) {
- info("Posting files to base url " + u + (!t.auto?" using content-type "+(type==null?DEFAULT_CONTENT_TYPE:type):"")+"..");
- if(t.auto)
- info("Entering auto mode. File endings considered are "+t.getFileTypes());
- if(t.recursive)
- info("Entering recursive mode");
- numFilesPosted = t.postFiles(args, 0, out, type);
- info(numFilesPosted + " files indexed.");
- }
- } else {
- usageShort();
- return;
- }
- } else if (DATA_MODE_ARGS.equals(mode)) {
- if (0 < args.length) {
- info("POSTing args to " + u + "..");
- for (String a : args) {
- t.postData(SimplePostTool.stringToStream(a), null, out, type);
- }
- } else {
- usageShort();
- return;
+ if(type != null) {
+ fatal("Specifying content-type with \"-Ddata=web\" is not supported");
+ }
+ if (args[0].equals("-")) {
+ // Skip posting url if special param "-" given
+ return 0;
+ }
+ // Set Extracting handler as default
+ solrUrl = appendUrlPath(solrUrl, "/extract");
+
+ info("Posting web pages to Solr url "+solrUrl);
+ auto=true;
+ info("Entering auto mode. Indexing pages with content-types corresponding to file endings "+fileTypes);
+ if(recursive > 0) {
+ if(recursive > MAX_WEB_DEPTH) {
+ recursive = MAX_WEB_DEPTH;
+ warn("Too large recursion depth for web mode, limiting to "+MAX_WEB_DEPTH+"...");
}
- } else if (DATA_MODE_STDIN.equals(mode)) {
- info("POSTing stdin to " + u + "..");
- t.postData(System.in, null, out, type);
- }
- if (isOn(System.getProperty("commit",DEFAULT_COMMIT))) {
- info("COMMITting Solr index changes to " + u + "..");
- t.commit();
- }
- if (isOn(System.getProperty("optimize",DEFAULT_OPTIMIZE))) {
- info("Performing an OPTIMIZE to " + u + "..");
- t.optimize();
+ if(delay < DEFAULT_WEB_DELAY)
+ warn("Never crawl an external web site faster than every 10 seconds, your IP will probably be blocked");
+ info("Entering recursive mode, depth="+recursive+", delay="+delay+"s");
}
-
- } catch(RuntimeException e) {
- e.printStackTrace();
- fatal("RuntimeException " + e);
+ numPagesPosted = postWebPages(args, 0, out);
+ info(numPagesPosted + " web pages indexed.");
+ } catch(MalformedURLException e) {
+ fatal("Wrong URL trying to append /extract to "+solrUrl);
}
+ return numPagesPosted;
+ }
+
+ private void doStdinMode() {
+ info("POSTing stdin to " + solrUrl + "..");
+ postData(System.in, null, out, type, solrUrl);
}
+ private void reset() {
+ fileTypes = DEFAULT_FILE_TYPES;
+ globFileFilter = this.getFileFilterFromFileTypes(fileTypes);
+ backlog = new ArrayList<LinkedHashSet<URL>>();
+ visited = new HashSet<URL>();
+ }
+
+
+ //
+ // USAGE
+ //
private static void usageShort() {
System.out.println(USAGE_STRING_SHORT+"\n"+
" Please invoke with -h option for extended usage help.");
@@ -200,11 +344,12 @@ public class SimplePostTool {
System.out.println
(USAGE_STRING_SHORT+"\n\n" +
"Supported System Properties and their defaults:\n"+
- " -Ddata=files|args|stdin (default=" + DEFAULT_DATA_MODE + ")\n"+
+ " -Ddata=files|web|args|stdin (default=" + DEFAULT_DATA_MODE + ")\n"+
" -Dtype=<content-type> (default=" + DEFAULT_CONTENT_TYPE + ")\n"+
" -Durl=<solr-update-url> (default=" + DEFAULT_POST_URL + ")\n"+
" -Dauto=yes|no (default=" + DEFAULT_AUTO + ")\n"+
- " -Drecursive=yes|no (default=" + DEFAULT_RECURSIVE + ")\n"+
+ " -Drecursive=yes|no|<depth> (default=" + DEFAULT_RECURSIVE + ")\n"+
+ " -Ddelay=<seconds> (default=0 for files, 10 for web)\n"+
" -Dfiletypes=<type>[,<type>,...] (default=" + DEFAULT_FILE_TYPES + ")\n"+
" -Dparams=\"<key>=<value>[&<key>=<value>...]\" (values must be URL-encoded)\n"+
" -Dcommit=yes|no (default=" + DEFAULT_COMMIT + ")\n"+
@@ -212,11 +357,12 @@ public class SimplePostTool {
" -Dout=yes|no (default=" + DEFAULT_OUT + ")\n\n"+
"This is a simple command line tool for POSTing raw data to a Solr\n"+
"port. Data can be read from files specified as commandline args,\n"+
- "as raw commandline arg strings, or via STDIN.\n"+
+ "URLs specified as args, as raw commandline arg strings or via STDIN.\n"+
"Examples:\n"+
" java -jar post.jar *.xml\n"+
" java -Ddata=args -jar post.jar '<delete><id>42</id></delete>'\n"+
" java -Ddata=stdin -jar post.jar < hd.xml\n"+
+ " java -Ddata=web -jar post.jar http://example.com/\n"+
" java -Dtype=text/csv -jar post.jar *.csv\n"+
" java -Dtype=application/json -jar post.jar *.json\n"+
" java -Durl=http://localhost:8983/solr/update/extract -Dparams=literal.id=a -Dtype=application/pdf -jar post.jar a.pdf\n"+
@@ -228,13 +374,10 @@ public class SimplePostTool {
"or optimize should be executed, and whether the response should\n"+
"be written to STDOUT. If auto=yes the tool will try to set type\n"+
"and url automatically from file name. When posting rich documents\n"+
- "the file name will be propagated as \"resource.name\" and also used as \"literal.id\".\n" +
- "You may override these or any other request parameter through the -Dparams property.\n"+
- "If you want to do a commit only, use \"-\" as argument.");
- }
-
- private static boolean isOn(String property) {
- return(TRUE_STRINGS.indexOf(property) >= 0);
+ "the file name will be propagated as \"resource.name\" and also used\n"+
+ "as \"literal.id\". You may override these or any other request parameter\n"+
+ "through the -Dparams property. To do a commit only, use \"-\" as argument.\n"+
+ "The web mode is a simple crawler following links within domain, default delay=10s.");
}
/** Post all filenames provided in args
@@ -244,7 +387,8 @@ public class SimplePostTool {
* @param type default content-type to use when posting (may be overridden in auto mode)
* @return number of files posted
* */
- int postFiles(String [] args,int startIndexInArgs, OutputStream out, String type) {
+ public int postFiles(String [] args,int startIndexInArgs, OutputStream out, String type) {
+ reset();
int filesPosted = 0;
for (int j = startIndexInArgs; j < args.length; j++) {
File srcFile = new File(args[j]);
@@ -258,7 +402,7 @@ public class SimplePostTool {
String fileGlob = srcFile.getName();
GlobFileFilter ff = new GlobFileFilter(fileGlob, false);
File[] files = parent.listFiles(ff);
- if(files.length == 0) {
+ if(files == null || files.length == 0) {
warn("No files or directories matching "+srcFile);
continue;
}
@@ -268,32 +412,255 @@ public class SimplePostTool {
return filesPosted;
}
+ /** Post all filenames provided in args
+ * @param files array of Files
+ * @param startIndexInArgs offset to start
+ * @param out output stream to post data to
+ * @param type default content-type to use when posting (may be overridden in auto mode)
+ * @return number of files posted
+ * */
+ public int postFiles(File[] files, int startIndexInArgs, OutputStream out, String type) {
+ reset();
+ int filesPosted = 0;
+ for (File srcFile : files) {
+ if(srcFile.isDirectory() && srcFile.canRead()) {
+ filesPosted += postDirectory(srcFile, out, type);
+ } else if (srcFile.isFile() && srcFile.canRead()) {
+ filesPosted += postFiles(new File[] {srcFile}, out, type);
+ } else {
+ File parent = srcFile.getParentFile();
+ if(parent == null) parent = new File(".");
+ String fileGlob = srcFile.getName();
+ GlobFileFilter ff = new GlobFileFilter(fileGlob, false);
+ File[] fileList = parent.listFiles(ff);
+ if(fileList == null || fileList.length == 0) {
+ warn("No files or directories matching "+srcFile);
+ continue;
+ }
+ filesPosted += postFiles(fileList, out, type);
+ }
+ }
+ return filesPosted;
+ }
+
+ /**
+ * Posts a whole directory
+ * @return number of files posted total
+ */
private int postDirectory(File dir, OutputStream out, String type) {
if(dir.isHidden() && !dir.getName().equals("."))
return(0);
- info("Indexing directory "+dir.getPath());
+ info("Indexing directory "+dir.getPath()+" ("+dir.listFiles(globFileFilter).length+" files, depth="+currentDepth+")");
int posted = 0;
posted += postFiles(dir.listFiles(globFileFilter), out, type);
- if(recursive) {
+ if(recursive > currentDepth) {
for(File d : dir.listFiles()) {
- if(d.isDirectory())
+ if(d.isDirectory()) {
+ currentDepth++;
posted += postDirectory(d, out, type);
+ currentDepth--;
+ }
}
}
return posted;
}
+ /**
+ * Posts a list of file names
+ * @return number of files posted
+ */
int postFiles(File[] files, OutputStream out, String type) {
int filesPosted = 0;
for(File srcFile : files) {
- if(!srcFile.isFile() || srcFile.isHidden())
- continue;
- postFile(srcFile, out, type);
- filesPosted++;
+ try {
+ if(!srcFile.isFile() || srcFile.isHidden())
+ continue;
+ postFile(srcFile, out, type);
+ Thread.sleep(delay * 1000);
+ filesPosted++;
+ } catch (InterruptedException e) {
+ throw new RuntimeException();
+ }
}
return filesPosted;
}
+ /**
+ * This method takes as input a list of start URL strings for crawling,
+ * adds each one to a backlog and then starts crawling
+ * @param args the raw input args from main()
+ * @param startIndexInArgs offset for where to start
+ * @param out outputStream to write results to
+ * @return the number of web pages posted
+ */
+ public int postWebPages(String[] args, int startIndexInArgs, OutputStream out) {
+ reset();
+ LinkedHashSet<URL> s = new LinkedHashSet<URL>();
+ for (int j = startIndexInArgs; j < args.length; j++) {
+ try {
+ URL u = new URL(normalizeUrlEnding(args[j]));
+ s.add(u);
+ } catch(MalformedURLException e) {
+ warn("Skipping malformed input URL: "+args[j]);
+ }
+ }
+ // Add URLs to level 0 of the backlog and start recursive crawling
+ backlog.add(s);
+ return webCrawl(0, out);
+ }
+
+ /**
+ * Normalizes a URL string by removing anchor part and trailing slash
+ * @return the normalized URL string
+ */
+ protected static String normalizeUrlEnding(String link) {
+ if(link.indexOf("#") > -1)
+ link = link.substring(0,link.indexOf("#"));
+ if(link.endsWith("?"))
+ link = link.substring(0,link.length()-1);
+ if(link.endsWith("/"))
+ link = link.substring(0,link.length()-1);
+ return link;
+ }
+
+ /**
+ * A very simple crawler, pulling URLs to fetch from a backlog and then
+ * recurses N levels deep if recursive>0. Links are parsed from HTML
+ * through first getting an XHTML version using SolrCell with extractOnly,
+ * and followed if they are local. The crawler pauses for a default delay
+ * of 10 seconds between each fetch, this can be configured in the delay
+ * variable. This is only meant for test purposes, as it does not respect
+ * robots or anything else fancy :)
+ * @param level which level to crawl
+ * @param out output stream to write to
+ * @return number of pages crawled on this level and below
+ */
+ protected int webCrawl(int level, OutputStream out) {
+ int numPages = 0;
+ LinkedHashSet<URL> stack = backlog.get(level);
+ int rawStackSize = stack.size();
+ stack.removeAll(visited);
+ int stackSize = stack.size();
+ LinkedHashSet<URL> subStack = new LinkedHashSet<URL>();
+ info("Entering crawl at level "+level+" ("+rawStackSize+" links total, "+stackSize+" new)");
+ for(URL u : stack) {
+ try {
+ visited.add(u);
+ PageFetcherResult result = pageFetcher.readPageFromUrl(u);
+ if(result.httpStatus == 200) {
+ u = (result.redirectUrl != null) ? result.redirectUrl : u;
+ URL postUrl = new URL(appendParam(solrUrl.toString(),
+ "literal.id="+URLEncoder.encode(u.toString(),"UTF-8") +
+ "&literal.url="+URLEncoder.encode(u.toString(),"UTF-8")));
+ boolean success = postData(new ByteArrayInputStream(result.content), null, out, result.contentType, postUrl);
+ if (success) {
+ info("POSTed web resource "+u+" (depth: "+level+")");
+ Thread.sleep(delay * 1000);
+ numPages++;
+ // Pull links from HTML pages only
+ if(recursive > level && result.contentType.equals("text/html")) {
+ Set<URL> children = pageFetcher.getLinksFromWebPage(u, new ByteArrayInputStream(result.content), result.contentType, postUrl);
+ subStack.addAll(children);
+ }
+ } else {
+ warn("An error occurred while posting "+u);
+ }
+ } else {
+ warn("The URL "+u+" returned a HTTP result status of "+result.httpStatus);
+ }
+ } catch (IOException e) {
+ warn("Caught exception when trying to open connection to "+u+": "+e.getMessage());
+ } catch (InterruptedException e) {
+ throw new RuntimeException();
+ }
+ }
+ if(!subStack.isEmpty()) {
+ backlog.add(subStack);
+ numPages += webCrawl(level+1, out);
+ }
+ return numPages;
+ }
+
+ /**
+ * Reads an input stream into a byte array
+ * @param is the input stream
+ * @return the byte array
+ * @throws IOException
+ */
+ protected byte[] inputStreamToByteArray(InputStream is) throws IOException {
+ ByteArrayOutputStream bos = new ByteArrayOutputStream();
+ int next = is.read();
+ while (next > -1) {
+ bos.write(next);
+ next = is.read();
+ }
+ bos.flush();
+ is.close();
+ return bos.toByteArray();
+ }
+
+ /**
+ * Computes the full URL based on a base url and a possibly relative link found
+ * in the href param of an HTML anchor.
+ * @param baseUrl the base url from where the link was found
+ * @param link the absolute or relative link
+ * @return the string version of the full URL
+ */
+ protected String computeFullUrl(URL baseUrl, String link) {
+ if(link == null || link.length() == 0) {
+ return null;
+ }
+ if(!link.startsWith("http")) {
+ if(link.startsWith("/")) {
+ link = baseUrl.getProtocol() + "://" + baseUrl.getAuthority() + link;
+ } else {
+ if(link.contains(":")) {
+ return null; // Skip non-relative URLs
+ }
+ String path = baseUrl.getPath();
+ if(!path.endsWith("/")) {
+ int sep = path.lastIndexOf("/");
+ String file = path.substring(sep+1);
+ if(file.contains(".") || file.contains("?"))
+ path = path.substring(0,sep);
+ }
+ link = baseUrl.getProtocol() + "://" + baseUrl.getAuthority() + path + "/" + link;
+ }
+ }
+ link = normalizeUrlEnding(link);
+ String l = link.toLowerCase(Locale.ROOT);
+ // Simple brute force skip images
+ if(l.endsWith(".jpg") || l.endsWith(".jpeg") || l.endsWith(".png") || l.endsWith(".gif")) {
+ return null; // Skip images
+ }
+ return link;
+ }
+
+ /**
+ * Uses the mime-type map to reverse lookup whether the file ending for our type
+ * is supported by the fileTypes option
+ * @param type what content-type to lookup
+ * @return true if this is a supported content type
+ */
+ protected boolean typeSupported(String type) {
+ for(String key : mimeMap.keySet()) {
+ if(mimeMap.get(key).equals(type)) {
+ if(fileTypes.contains(key))
+ return true;
+ }
+ }
+ return false;
+ }
+
+ /**
+ * Tests if a string is either "true", "on", "yes" or "1"
+ * @param property the string to test
+ * @return true if "on"
+ */
+ protected static boolean isOn(String property) {
+ return("true,on,yes,1".indexOf(property) > -1);
+ }
+
static void warn(String msg) {
System.err.println("SimplePostTool: WARNING: " + msg);
}
@@ -304,21 +671,14 @@ public class SimplePostTool {
static void fatal(String msg) {
System.err.println("SimplePostTool: FATAL: " + msg);
- System.exit(1);
- }
-
- /**
- * Constructs an instance for posting data to the specified Solr URL
- * (ie: "http://localhost:8983/solr/update")
- */
- public SimplePostTool(URL solrUrl) {
- this.solrUrl = solrUrl;
+ System.exit(2);
}
/**
* Does a simple commit operation
*/
public void commit() {
+ info("COMMITting Solr index changes to " + solrUrl + "..");
doGet(appendParam(solrUrl.toString(), "commit=true"));
}
@@ -326,9 +686,16 @@ public class SimplePostTool {
* Does a simple optimize operation
*/
public void optimize() {
+ info("Performing an OPTIMIZE to " + solrUrl + "..");
doGet(appendParam(solrUrl.toString(), "optimize=true"));
}
+ /**
+ * Appends a URL query parameter to a URL
+ * @param url the original URL
+ * @param param the parameter(s) to append, separated by "&"
+ * @return the string version of the resulting URL
+ */
public static String appendParam(String url, String param) {
String[] pa = param.split("&");
for(String p : pa) {
@@ -360,13 +727,12 @@ public class SimplePostTool {
// Default handler
} else {
// SolrCell
- String urlStr = url.getProtocol() + "://" + url.getAuthority() + url.getPath() + "/extract" + (url.getQuery() != null ? "?"+url.getQuery() : "");
+ String urlStr = appendUrlPath(solrUrl, "/extract").toString();
if(urlStr.indexOf("resource.name")==-1)
urlStr = appendParam(urlStr, "resource.name=" + URLEncoder.encode(file.getAbsolutePath(), "UTF-8"));
if(urlStr.indexOf("literal.id")==-1)
urlStr = appendParam(urlStr, "literal.id=" + URLEncoder.encode(file.getAbsolutePath(), "UTF-8"));
url = new URL(urlStr);
-// info("Indexing to ExtractingRequestHandler with URL "+url);
}
} else {
warn("Skipping "+file.getName()+". Unsupported file type for auto mode.");
@@ -390,7 +756,23 @@ public class SimplePostTool {
}
}
- private String guessType(File file) {
+ /**
+ * Appends to the path of the URL
+ * @param url the URL
+ * @param append the path to append
+ * @return the final URL version
+ * @throws MalformedURLException
+ */
+ protected static URL appendUrlPath(URL url, String append) throws MalformedURLException {
+ return new URL(url.getProtocol() + "://" + url.getAuthority() + url.getPath() + append + (url.getQuery() != null ? "?"+url.getQuery() : ""));
+ }
+
+ /**
+ * Guesses the type of a file, based on file name suffix
+ * @param file the file
+ * @return the content-type guessed
+ */
+ protected static String guessType(File file) {
String name = file.getName();
String suffix = name.substring(name.lastIndexOf(".")+1);
return mimeMap.get(suffix.toLowerCase(Locale.ROOT));
@@ -412,6 +794,7 @@ public class SimplePostTool {
*/
public static void doGet(URL url) {
try {
+ if(mockMode) return;
HttpURLConnection urlc = (HttpURLConnection) url.openConnection();
if (HttpURLConnection.HTTP_OK != urlc.getResponseCode()) {
warn("Solr returned an error #" + urlc.getResponseCode() +
@@ -422,15 +805,14 @@ public class SimplePostTool {
}
}
- public void postData(InputStream data, Integer length, OutputStream output, String type) {
- postData(data, length, output, type, solrUrl);
- }
-
/**
* Reads data from the data stream and posts it to solr,
* writes to the response to output
+ * @return true if success
*/
- public void postData(InputStream data, Integer length, OutputStream output, String type, URL url) {
+ public boolean postData(InputStream data, Integer length, OutputStream output, String type, URL url) {
+ if(mockMode) return true;
+ boolean success = true;
if(type == null)
type = DEFAULT_CONTENT_TYPE;
HttpURLConnection urlc = null;
@@ -441,7 +823,6 @@ public class SimplePostTool {
urlc.setRequestMethod("POST");
} catch (ProtocolException e) {
fatal("Shouldn't happen: HttpURLConnection doesn't support POST??"+e);
-
}
urlc.setDoOutput(true);
urlc.setDoInput(true);
@@ -453,6 +834,7 @@ public class SimplePostTool {
} catch (IOException e) {
fatal("Connection error (is Solr running at " + solrUrl + " ?): " + e);
+ success = false;
}
OutputStream out = null;
@@ -461,6 +843,7 @@ public class SimplePostTool {
pipe(data, out);
} catch (IOException e) {
fatal("IOException while posting data: " + e);
+ success = false;
} finally {
try { if(out!=null) out.close(); } catch (IOException x) { /*NOOP*/ }
}
@@ -470,12 +853,14 @@ public class SimplePostTool {
if (HttpURLConnection.HTTP_OK != urlc.getResponseCode()) {
warn("Solr returned an error #" + urlc.getResponseCode() +
" " + urlc.getResponseMessage());
+ success = false;
}
in = urlc.getInputStream();
pipe(in, output);
} catch (IOException e) {
warn("IOException while reading response: " + e);
+ success = false;
} finally {
try { if(in!=null) in.close(); } catch (IOException x) { /*NOOP*/ }
}
@@ -483,8 +868,14 @@ public class SimplePostTool {
} finally {
if(urlc!=null) urlc.disconnect();
}
+ return success;
}
+ /**
+ * Converts a string to an input stream
+ * @param s the string
+ * @return the input stream
+ */
public static InputStream stringToStream(String s) {
InputStream is = null;
try {
@@ -508,36 +899,64 @@ public class SimplePostTool {
if (null != dest) dest.flush();
}
- public boolean isAuto() {
- return auto;
- }
-
- public void setAuto(boolean auto) {
- this.auto = auto;
- }
-
- public boolean isRecursive() {
- return recursive;
- }
-
- public void setRecursive(boolean recursive) {
- this.recursive = recursive;
- }
-
- public String getFileTypes() {
- return fileTypes;
- }
-
- public void setFileTypes(String fileTypes) {
- this.fileTypes = fileTypes;
+ public GlobFileFilter getFileFilterFromFileTypes(String fileTypes) {
String glob;
if(fileTypes.equals("*"))
glob = ".*";
else
glob = "^.*\\.(" + fileTypes.replace(",", "|") + ")$";
- this.globFileFilter = new GlobFileFilter(glob, true);
+ return new GlobFileFilter(glob, true);
+ }
+
+ //
+ // Utility methods for XPath handing
+ //
+
+ /**
+ * Gets all nodes matching an XPath
+ */
+ public static NodeList getNodesFromXP(Node n, String xpath) throws XPathExpressionException {
+ XPathFactory factory = XPathFactory.newInstance();
+ XPath xp = factory.newXPath();
+ XPathExpression expr = xp.compile(xpath);
+ return (NodeList) expr.evaluate(n, XPathConstants.NODESET);
+ }
+
+ /**
+ * Gets the string content of the matching an XPath
+ * @param n the node (or doc)
+ * @param xpath the xpath string
+ * @param concatAll if true, text from all matching nodes will be concatenated, else only the first returned
+ */
+ public static String getXP(Node n, String xpath, boolean concatAll)
+ throws XPathExpressionException {
+ NodeList nodes = getNodesFromXP(n, xpath);
+ StringBuffer sb = new StringBuffer();
+ if (nodes.getLength() > 0) {
+ for(int i = 0; i < nodes.getLength() ; i++) {
+ sb.append(nodes.item(i).getNodeValue() + " ");
+ if(!concatAll) break;
+ }
+ return sb.toString().trim();
+ } else
+ return "";
+ }
+
+ /**
+ * Takes a string as input and returns a DOM
+ */
+ public static Document makeDom(String in, String inputEncoding) throws SAXException, IOException,
+ ParserConfigurationException {
+ InputStream is = new ByteArrayInputStream(in
+ .getBytes(inputEncoding));
+ Document dom = DocumentBuilderFactory.newInstance()
+ .newDocumentBuilder().parse(is);
+ return dom;
}
+ /**
+ * Inner class to filter files based on glob wildcards
+ */
class GlobFileFilter implements FileFilter
{
private String _pattern;
@@ -571,4 +990,170 @@ public class SimplePostTool {
return p.matcher(file.getName()).find();
}
}
+
+ //
+ // Simple crawler class which can fetch a page and check for robots.txt
+ //
+ class PageFetcher {
+ Map<String, List<String>> robotsCache;
+ final String DISALLOW = "Disallow:";
+
+ public PageFetcher() {
+ robotsCache = new HashMap<String,List<String>>();
+ }
+
+ public PageFetcherResult readPageFromUrl(URL u) {
+ PageFetcherResult res = new PageFetcherResult();
+ try {
+ if (isDisallowedByRobots(u)) {
+ warn("The URL "+u+" is disallowed by robots.txt and will not be crawled.");
+ res.httpStatus = 403;
+ visited.add(u);
+ return res;
+ }
+ res.httpStatus = 404;
+ HttpURLConnection conn = (HttpURLConnection) u.openConnection();
+ conn.setRequestProperty("User-Agent", "SimplePostTool-crawler/"+VERSION_OF_THIS_TOOL+" (http://lucene.apache.org/solr/)");
+ conn.setRequestProperty("Accept-Encoding", "gzip, deflate");
+ conn.connect();
+ res.httpStatus = conn.getResponseCode();
+ if(!normalizeUrlEnding(conn.getURL().toString()).equals(normalizeUrlEnding(u.toString()))) {
+ info("The URL "+u+" caused a redirect to "+conn.getURL());
+ u = conn.getURL();
+ res.redirectUrl = u;
+ visited.add(u);
+ }
+ if(res.httpStatus == 200) {
+ // Raw content type of form "text/html; encoding=utf-8"
+ String rawContentType = conn.getContentType();
+ String type = rawContentType.split(";")[0];
+ if(typeSupported(type)) {
+ String encoding = conn.getContentEncoding();
+ InputStream is;
+ if (encoding != null && encoding.equalsIgnoreCase("gzip")) {
+ is = new GZIPInputStream(conn.getInputStream());
+ } else if (encoding != null && encoding.equalsIgnoreCase("deflate")) {
+ is = new InflaterInputStream(conn.getInputStream(), new Inflater(true));
+ } else {
+ is = conn.getInputStream();
+ }
+
+ // Read into memory, so that we later can pull links from the page without re-fetching
+ res.content = inputStreamToByteArray(is);
+ is.close();
+ } else {
+ warn("Skipping URL with unsupported type "+type);
+ res.httpStatus = 415;
+ }
+ }
+ } catch(IOException e) {
+ warn("IOException when reading page from url "+u+": "+e.getMessage());
+ }
+ return res;
+ }
+
+ public boolean isDisallowedByRobots(URL url) {
+ String host = url.getHost();
+ String strRobot = url.getProtocol() + "://" + host + "/robots.txt";
+ List<String> disallows = robotsCache.get(host);
+ if(disallows == null) {
+ disallows = new ArrayList<String>();
+ URL urlRobot;
+ try {
+ urlRobot = new URL(strRobot);
+ disallows = parseRobotsTxt(urlRobot.openStream());
+ } catch (MalformedURLException e) {
+ return true; // We cannot trust this robots URL, should not happen
+ } catch (IOException e) {
+ // There is no robots.txt, will cache an empty disallow list
+ }
+ }
+
+ robotsCache.put(host, disallows);
+
+ String strURL = url.getFile();
+ for (String path : disallows) {
+ if (path.equals("/") || strURL.indexOf(path) == 0)
+ return true;
+ }
+ return false;
+ }
+
+ /**
+ * Very simple robots.txt parser which obeys all Disallow lines regardless
+ * of user agent or whether there are valid Allow: lines.
+ * @param is Input stream of the robots.txt file
+ * @return a list of disallow paths
+ * @throws IOException if problems reading the stream
+ */
+ protected List<String> parseRobotsTxt(InputStream is) throws IOException {
+ List<String> disallows = new ArrayList<String>();
+ BufferedReader r = new BufferedReader(new InputStreamReader(is, "UTF-8"));
+ String l;
+ while((l = r.readLine()) != null) {
+ String[] arr = l.split("#");
+ if(arr.length == 0) continue;
+ l = arr[0].trim();
+ if(l.startsWith(DISALLOW)) {
+ l = l.substring(DISALLOW.length()).trim();
+ if(l.length() == 0) continue;
+ disallows.add(l);
+ }
+ }
+ is.close();
+ return disallows;
+ }
+
+ /**
+ * Finds links on a web page, using /extract?extractOnly=true
+ * @param u the URL of the web page
+ * @param is the input stream of the page
+ * @param type the content-type
+ * @param postUrl the URL (typically /solr/extract) in order to pull out links
+ * @return a set of URLs parsed from the page
+ */
+ protected Set<URL> getLinksFromWebPage(URL u, InputStream is, String type, URL postUrl) {
+ Set<URL> l = new HashSet<URL>();
+ URL url = null;
+ try {
+ ByteArrayOutputStream os = new ByteArrayOutputStream();
+ URL extractUrl = new URL(appendParam(postUrl.toString(), "extractOnly=true"));
+ boolean success = postData(is, null, os, type, extractUrl);
+ if(success) {
+ String rawXml = os.toString("UTF-8");
+ Document d = makeDom(rawXml, "UTF-8");
+ String innerXml = getXP(d, "/response/str/text()[1]", false);
+ d = makeDom(innerXml, "UTF-8");
+ NodeList links = getNodesFromXP(d, "/html/body//a/@href");
+ for(int i = 0; i < links.getLength(); i++) {
+ String link = links.item(i).getTextContent();
+ link = computeFullUrl(u, link);
+ if(link == null)
+ continue;
+ url = new URL(link);
+ if(url.getAuthority() == null || !url.getAuthority().equals(u.getAuthority()))
+ continue;
+ l.add(url);
+ }
+ }
+ } catch (MalformedURLException e) {
+ warn("Malformed URL "+url);
+ } catch (IOException e) {
+ warn("IOException opening URL "+url+": "+e.getMessage());
+ } catch (Exception e) {
+ throw new RuntimeException();
+ }
+ return l;
+ }
+ }
+
+ /**
+ * Utility class to hold the result form a page fetch
+ */
+ public class PageFetcherResult {
+ int httpStatus = 200;
+ String contentType = "text/html";
+ URL redirectUrl = null;
+ byte[] content;
+ }
}
Modified: lucene/dev/branches/lucene3312/solr/core/src/test-files/solr/collection1/conf/solrconfig.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3312/solr/core/src/test-files/solr/collection1/conf/solrconfig.xml?rev=1374718&r1=1374717&r2=1374718&view=diff
==============================================================================
--- lucene/dev/branches/lucene3312/solr/core/src/test-files/solr/collection1/conf/solrconfig.xml (original)
+++ lucene/dev/branches/lucene3312/solr/core/src/test-files/solr/collection1/conf/solrconfig.xml Sun Aug 19 09:35:25 2012
@@ -54,7 +54,7 @@
-->
<maxBufferedDocs>10</maxBufferedDocs>
<mergePolicy class="org.apache.lucene.index.LogDocMergePolicy"/>
- <lockType>single</lockType>
+ <lockType>native</lockType>
<unlockOnStartup>true</unlockOnStartup>
</indexConfig>
Modified: lucene/dev/branches/lucene3312/solr/core/src/test/org/apache/solr/handler/component/StatsComponentTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3312/solr/core/src/test/org/apache/solr/handler/component/StatsComponentTest.java?rev=1374718&r1=1374717&r2=1374718&view=diff
==============================================================================
--- lucene/dev/branches/lucene3312/solr/core/src/test/org/apache/solr/handler/component/StatsComponentTest.java (original)
+++ lucene/dev/branches/lucene3312/solr/core/src/test/org/apache/solr/handler/component/StatsComponentTest.java Sun Aug 19 09:35:25 2012
@@ -228,32 +228,35 @@ public class StatsComponentTest extends
}
public void doTestFacetStatisticsResult(String f) throws Exception {
- assertU(adoc("id", "1", f, "10", "active_s", "true"));
- assertU(adoc("id", "2", f, "20", "active_s", "true"));
- assertU(adoc("id", "3", f, "30", "active_s", "false"));
- assertU(adoc("id", "4", f, "40", "active_s", "false"));
+ assertU(adoc("id", "1", f, "10", "active_s", "true", "other_s", "foo"));
+ assertU(adoc("id", "2", f, "20", "active_s", "true", "other_s", "bar"));
+ assertU(adoc("id", "3", f, "30", "active_s", "false", "other_s", "foo"));
+ assertU(adoc("id", "4", f, "40", "active_s", "false", "other_s", "foo"));
assertU(commit());
- assertQ("test value for active_s=true", req("q","*:*", "stats","true", "stats.field",f, "stats.facet","active_s","indent","true")
- , "//lst[@name='true']/double[@name='min'][.='10.0']"
- , "//lst[@name='true']/double[@name='max'][.='20.0']"
- , "//lst[@name='true']/double[@name='sum'][.='30.0']"
- , "//lst[@name='true']/long[@name='count'][.='2']"
- , "//lst[@name='true']/long[@name='missing'][.='0']"
- , "//lst[@name='true']/double[@name='sumOfSquares'][.='500.0']"
- , "//lst[@name='true']/double[@name='mean'][.='15.0']"
- , "//lst[@name='true']/double[@name='stddev'][.='7.0710678118654755']"
+ final String pre = "//lst[@name='stats_fields']/lst[@name='"+f+"']/lst[@name='facets']/lst[@name='active_s']";
+
+ assertQ("test value for active_s=true", req("q","*:*", "stats","true", "stats.field",f, "stats.facet","active_s","stats.facet","other_s","indent","true")
+ , "*[count("+pre+")=1]"
+ , pre+"/lst[@name='true']/double[@name='min'][.='10.0']"
+ , pre+"/lst[@name='true']/double[@name='max'][.='20.0']"
+ , pre+"/lst[@name='true']/double[@name='sum'][.='30.0']"
+ , pre+"/lst[@name='true']/long[@name='count'][.='2']"
+ , pre+"/lst[@name='true']/long[@name='missing'][.='0']"
+ , pre+"/lst[@name='true']/double[@name='sumOfSquares'][.='500.0']"
+ , pre+"/lst[@name='true']/double[@name='mean'][.='15.0']"
+ , pre+"/lst[@name='true']/double[@name='stddev'][.='7.0710678118654755']"
);
assertQ("test value for active_s=false", req("q","*:*", "stats","true", "stats.field",f, "stats.facet","active_s")
- , "//lst[@name='false']/double[@name='min'][.='30.0']"
- , "//lst[@name='false']/double[@name='max'][.='40.0']"
- , "//lst[@name='false']/double[@name='sum'][.='70.0']"
- , "//lst[@name='false']/long[@name='count'][.='2']"
- , "//lst[@name='false']/long[@name='missing'][.='0']"
- , "//lst[@name='false']/double[@name='sumOfSquares'][.='2500.0']"
- , "//lst[@name='false']/double[@name='mean'][.='35.0']"
- , "//lst[@name='false']/double[@name='stddev'][.='7.0710678118654755']"
+ , pre+"/lst[@name='false']/double[@name='min'][.='30.0']"
+ , pre+"/lst[@name='false']/double[@name='max'][.='40.0']"
+ , pre+"/lst[@name='false']/double[@name='sum'][.='70.0']"
+ , pre+"/lst[@name='false']/long[@name='count'][.='2']"
+ , pre+"/lst[@name='false']/long[@name='missing'][.='0']"
+ , pre+"/lst[@name='false']/double[@name='sumOfSquares'][.='2500.0']"
+ , pre+"/lst[@name='false']/double[@name='mean'][.='35.0']"
+ , pre+"/lst[@name='false']/double[@name='stddev'][.='7.0710678118654755']"
);
}
Modified: lucene/dev/branches/lucene3312/solr/example/solr/collection1/conf/solrconfig.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3312/solr/example/solr/collection1/conf/solrconfig.xml?rev=1374718&r1=1374717&r2=1374718&view=diff
==============================================================================
--- lucene/dev/branches/lucene3312/solr/example/solr/collection1/conf/solrconfig.xml (original)
+++ lucene/dev/branches/lucene3312/solr/example/solr/collection1/conf/solrconfig.xml Sun Aug 19 09:35:25 2012
@@ -954,7 +954,18 @@
</lst>
-->
</requestHandler>
-
+
+ <!-- for back compat with clients using /update/json and /update/csv -->
+ <requestHandler name="/update/json" class="solr.JsonUpdateRequestHandler">
+ <lst name="defaults">
+ <str name="stream.contentType">application/json</str>
+ </lst>
+ </requestHandler>
+ <requestHandler name="/update/csv" class="solr.CSVRequestHandler">
+ <lst name="defaults">
+ <str name="stream.contentType">application/csv</str>
+ </lst>
+ </requestHandler>
<!-- Solr Cell Update Request Handler
Modified: lucene/dev/branches/lucene3312/solr/example/solr/solr.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3312/solr/example/solr/solr.xml?rev=1374718&r1=1374717&r2=1374718&view=diff
==============================================================================
--- lucene/dev/branches/lucene3312/solr/example/solr/solr.xml (original)
+++ lucene/dev/branches/lucene3312/solr/example/solr/solr.xml Sun Aug 19 09:35:25 2012
@@ -47,7 +47,7 @@
All of the attributes in cores after defaultCoreName only apply when running in SolrCloud mode.
You can read more about SolrCloud mode at http://wiki.apache.org/solr/SolrCloud
-->
- <cores adminPath="/admin/cores" defaultCoreName="collection1" host="${host:}" hostPort="${jetty.port:}" zkClientTimeout="${zkClientTimeout:15000}">
+ <cores adminPath="/admin/cores" defaultCoreName="collection1" host="${host:}" hostPort="${jetty.port:}" hostContext="${hostContext:}" zkClientTimeout="${zkClientTimeout:15000}">
<core name="collection1" instanceDir="collection1" />
</cores>
</solr>
Modified: lucene/dev/branches/lucene3312/solr/licenses/commons-compress-NOTICE.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3312/solr/licenses/commons-compress-NOTICE.txt?rev=1374718&r1=1374717&r2=1374718&view=diff
==============================================================================
--- lucene/dev/branches/lucene3312/solr/licenses/commons-compress-NOTICE.txt (original)
+++ lucene/dev/branches/lucene3312/solr/licenses/commons-compress-NOTICE.txt Sun Aug 19 09:35:25 2012
@@ -1,5 +1,5 @@
Apache Commons Compress
-Copyright 2002-2011 The Apache Software Foundation
+Copyright 2002-2012 The Apache Software Foundation
This product includes software developed by
The Apache Software Foundation (http://www.apache.org/).
Modified: lucene/dev/branches/lucene3312/solr/licenses/fontbox-NOTICE.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3312/solr/licenses/fontbox-NOTICE.txt?rev=1374718&r1=1374717&r2=1374718&view=diff
==============================================================================
--- lucene/dev/branches/lucene3312/solr/licenses/fontbox-NOTICE.txt (original)
+++ lucene/dev/branches/lucene3312/solr/licenses/fontbox-NOTICE.txt Sun Aug 19 09:35:25 2012
@@ -1,6 +1,6 @@
Apache FontBox
-Copyright 2008-2010 The Apache Software Foundation
+Copyright 2008-2012 The Apache Software Foundation
This product includes software developed at
The Apache Software Foundation (http://www.apache.org/).
Modified: lucene/dev/branches/lucene3312/solr/licenses/jempbox-NOTICE.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3312/solr/licenses/jempbox-NOTICE.txt?rev=1374718&r1=1374717&r2=1374718&view=diff
==============================================================================
--- lucene/dev/branches/lucene3312/solr/licenses/jempbox-NOTICE.txt (original)
+++ lucene/dev/branches/lucene3312/solr/licenses/jempbox-NOTICE.txt Sun Aug 19 09:35:25 2012
@@ -1,6 +1,6 @@
Apache JempBox
-Copyright 2008-2010 The Apache Software Foundation
+Copyright 2008-2012 The Apache Software Foundation
This product includes software developed at
The Apache Software Foundation (http://www.apache.org/).
Modified: lucene/dev/branches/lucene3312/solr/licenses/pdfbox-NOTICE.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3312/solr/licenses/pdfbox-NOTICE.txt?rev=1374718&r1=1374717&r2=1374718&view=diff
==============================================================================
--- lucene/dev/branches/lucene3312/solr/licenses/pdfbox-NOTICE.txt (original)
+++ lucene/dev/branches/lucene3312/solr/licenses/pdfbox-NOTICE.txt Sun Aug 19 09:35:25 2012
@@ -1,10 +1,14 @@
-
Apache PDFBox
-Copyright 2002-2010 The Apache Software Foundation
+Copyright 2011 The Apache Software Foundation
This product includes software developed at
The Apache Software Foundation (http://www.apache.org/).
-
-Based on source code contributed to the original PDFBox project.
+Based on source code originally developed in the PDFBox, JempBox and
+FontBox projects.
Copyright (c) 2002-2007, www.pdfbox.org
+Copyright (c) 2006-2007, www.jempbox.org
+
+Based on source code originally developed in the PaDaF project.
+Copyright (c) 2010 Atos Worldline SAS
+
Modified: lucene/dev/branches/lucene3312/solr/licenses/tika-core-NOTICE.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3312/solr/licenses/tika-core-NOTICE.txt?rev=1374718&r1=1374717&r2=1374718&view=diff
==============================================================================
--- lucene/dev/branches/lucene3312/solr/licenses/tika-core-NOTICE.txt (original)
+++ lucene/dev/branches/lucene3312/solr/licenses/tika-core-NOTICE.txt Sun Aug 19 09:35:25 2012
@@ -1,8 +1,15 @@
-
-Apache Tika core
-Copyright 2007-2010 The Apache Software Foundation
+Apache Tika
+Copyright 2011 The Apache Software Foundation
This product includes software developed at
The Apache Software Foundation (http://www.apache.org/).
+Copyright 1993-2010 University Corporation for Atmospheric Research/Unidata
+This software contains code derived from UCAR/Unidata's NetCDF library.
+
+Tika-server compoment uses CDDL-licensed dependencies: jersey (http://jersey.java.net/) and
+Grizzly (http://grizzly.java.net/)
+
+OpenCSV: Copyright 2005 Bytecode Pty Ltd. Licensed under the Apache License, Version 2.0
+IPTC Photo Metadata descriptions Copyright 2010 International Press Telecommunications Council.
Modified: lucene/dev/branches/lucene3312/solr/licenses/tika-parsers-NOTICE.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3312/solr/licenses/tika-parsers-NOTICE.txt?rev=1374718&r1=1374717&r2=1374718&view=diff
==============================================================================
--- lucene/dev/branches/lucene3312/solr/licenses/tika-parsers-NOTICE.txt (original)
+++ lucene/dev/branches/lucene3312/solr/licenses/tika-parsers-NOTICE.txt Sun Aug 19 09:35:25 2012
@@ -1,8 +1,15 @@
-
Apache Tika parsers
-Copyright 2007-2010 The Apache Software Foundation
+Copyright 2011 The Apache Software Foundation
This product includes software developed at
The Apache Software Foundation (http://www.apache.org/).
+Copyright 1993-2010 University Corporation for Atmospheric Research/Unidata
+This software contains code derived from UCAR/Unidata's NetCDF library.
+
+Tika-server compoment uses CDDL-licensed dependencies: jersey (http://jersey.java.net/) and
+Grizzly (http://grizzly.java.net/)
+
+OpenCSV: Copyright 2005 Bytecode Pty Ltd. Licensed under the Apache License, Version 2.0
+IPTC Photo Metadata descriptions Copyright 2010 International Press Telecommunications Council.
Modified: lucene/dev/branches/lucene3312/solr/solrj/src/java/org/apache/solr/common/util/ContentStreamBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3312/solr/solrj/src/java/org/apache/solr/common/util/ContentStreamBase.java?rev=1374718&r1=1374717&r2=1374718&view=diff
==============================================================================
--- lucene/dev/branches/lucene3312/solr/solrj/src/java/org/apache/solr/common/util/ContentStreamBase.java (original)
+++ lucene/dev/branches/lucene3312/solr/solrj/src/java/org/apache/solr/common/util/ContentStreamBase.java Sun Aug 19 09:35:25 2012
@@ -76,6 +76,7 @@ public abstract class ContentStreamBase
sourceInfo = "url";
}
+ @Override
public InputStream getStream() throws IOException {
URLConnection conn = this.url.openConnection();
@@ -102,37 +103,33 @@ public abstract class ContentStreamBase
sourceInfo = file.toURI().toString();
}
+ @Override
public String getContentType() {
if(contentType==null) {
+ InputStream stream = null;
try {
- char first = (char)new FileInputStream( file ).read();
+ stream = new FileInputStream(file);
+ char first = (char)stream.read();
if(first == '<') {
return "application/xml";
}
if(first == '{') {
return "application/json";
}
+ } catch(Exception ex) {
+ } finally {
+ if (stream != null) try {
+ stream.close();
+ } catch (IOException ioe) {}
}
- catch(Exception ex) {}
}
return contentType;
}
+ @Override
public InputStream getStream() throws IOException {
return new FileInputStream( file );
}
-
- /**
- * If an charset is defined (by the contentType) use that, otherwise
- * use a UTF-8 reader
- */
- @Override
- public Reader getReader() throws IOException {
- String charset = getCharsetFromContentType( contentType );
- return charset == null
- ? new InputStreamReader(getStream(), "UTF-8")
- : new InputStreamReader( getStream(), charset );
- }
}
@@ -152,6 +149,7 @@ public abstract class ContentStreamBase
sourceInfo = "string";
}
+ @Override
public String getContentType() {
if(contentType==null && str.length() > 0) {
char first = str.charAt(0);
@@ -166,6 +164,7 @@ public abstract class ContentStreamBase
return contentType;
}
+ @Override
public InputStream getStream() throws IOException {
return new ByteArrayInputStream( str.getBytes(DEFAULT_CHARSET) );
}
Modified: lucene/dev/branches/lucene3312/solr/solrj/src/java/org/apache/solr/common/util/FastOutputStream.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3312/solr/solrj/src/java/org/apache/solr/common/util/FastOutputStream.java?rev=1374718&r1=1374717&r2=1374718&view=diff
==============================================================================
--- lucene/dev/branches/lucene3312/solr/solrj/src/java/org/apache/solr/common/util/FastOutputStream.java (original)
+++ lucene/dev/branches/lucene3312/solr/solrj/src/java/org/apache/solr/common/util/FastOutputStream.java Sun Aug 19 09:35:25 2012
@@ -57,8 +57,8 @@ public class FastOutputStream extends Ou
public void write(byte b) throws IOException {
if (pos >= buf.length) {
- flush(buf, 0, buf.length);
written += pos;
+ flush(buf, 0, buf.length);
pos=0;
}
buf[pos++] = b;
@@ -66,29 +66,40 @@ public class FastOutputStream extends Ou
@Override
public void write(byte arr[], int off, int len) throws IOException {
- int space = buf.length - pos;
- if (len < space) {
- System.arraycopy(arr, off, buf, pos, len);
- pos += len;
- } else if (len<buf.length) {
- // if the data to write is small enough, buffer it.
+
+ for(;;) {
+ int space = buf.length - pos;
+
+ if (len <= space) {
+ System.arraycopy(arr, off, buf, pos, len);
+ pos += len;
+ return;
+ } else if (len > buf.length) {
+ if (pos>0) {
+ flush(buf,0,pos); // flush
+ written += pos;
+ pos=0;
+ }
+ // don't buffer, just write to sink
+ flush(arr, off, len);
+ written += len;
+ return;
+ }
+
+ // the remaining data is too big to fit in the free space, but
+ // not big enough to warrant writing on its own.
+ // copy whatever we can fit, then flush and iterate.
+
System.arraycopy(arr, off, buf, pos, space);
+ written += buf.length; // important to do this first, since buf.length can change after a flush!
flush(buf, 0, buf.length);
- written += buf.length;
- pos = len-space;
- System.arraycopy(arr, off+space, buf, 0, pos);
- } else {
- if (pos>0) {
- flush(buf,0,pos); // flush
- written += pos;
- pos=0;
- }
- // don't buffer, just write to sink
- flush(arr, off, len);
- written += len;
+ pos = 0;
+ off += space;
+ len -= space;
}
}
+
/** reserve at least len bytes at the end of the buffer.
* Invalid if len > buffer.length
* @param len
@@ -182,8 +193,8 @@ public class FastOutputStream extends Ou
*/
public void flushBuffer() throws IOException {
if (pos > 0) {
- flush(buf, 0, pos);
written += pos;
+ flush(buf, 0, pos);
pos=0;
}
}
Modified: lucene/dev/branches/lucene3312/solr/solrj/src/test/org/apache/solr/common/util/ContentStreamTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3312/solr/solrj/src/test/org/apache/solr/common/util/ContentStreamTest.java?rev=1374718&r1=1374717&r2=1374718&view=diff
==============================================================================
--- lucene/dev/branches/lucene3312/solr/solrj/src/test/org/apache/solr/common/util/ContentStreamTest.java (original)
+++ lucene/dev/branches/lucene3312/solr/solrj/src/test/org/apache/solr/common/util/ContentStreamTest.java Sun Aug 19 09:35:25 2012
@@ -17,22 +17,16 @@
package org.apache.solr.common.util;
-import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
-import java.io.StringReader;
-import java.net.ConnectException;
-import java.net.HttpURLConnection;
import java.net.URL;
-import java.net.URLConnection;
import org.apache.commons.io.IOUtils;
import org.apache.lucene.util.LuceneTestCase;
-import org.apache.solr.common.util.ContentStreamBase;
import org.apache.solr.core.SolrResourceLoader;
/**
@@ -66,50 +60,17 @@ public class ContentStreamTest extends L
public void testURLStream() throws IOException
{
- byte[] content = null;
- String contentType = null;
- URL url = new URL( "http://svn.apache.org/repos/asf/lucene/dev/trunk/" );
- InputStream in = null;
- try {
- HttpURLConnection conn = (HttpURLConnection)url.openConnection();
- conn.setConnectTimeout(1000);
- conn.setReadTimeout(1000);
- conn.connect();
- int code = conn.getResponseCode();
- assumeTrue("wrong response code from server: " + code, 200 == code);
- in = conn.getInputStream();
- contentType = conn.getContentType();
- content = IOUtils.toByteArray(in);
-
- assumeTrue("not enough content for test to be useful",
- content.length > 10 );
-
- } catch (IOException ex) {
- assumeNoException("Unable to connect to " + url + " to run the test.", ex);
- }finally {
- if (in != null) {
- IOUtils.closeQuietly(in);
- }
- }
-
-
- ContentStreamBase stream = new ContentStreamBase.URLStream( url );
- in = stream.getStream(); // getStream is needed before getSize is valid
- assertEquals( content.length, stream.getSize().intValue() );
+ InputStream is = new SolrResourceLoader(null, null).openResource( "solrj/README" );
+ assertNotNull( is );
+ File file = new File(TEMP_DIR, "README");
+ FileOutputStream os = new FileOutputStream(file);
+ IOUtils.copy(is, os);
+ os.close();
- try {
- assertTrue( IOUtils.contentEquals(
- new ByteArrayInputStream(content), in ) );
- }
- finally {
- IOUtils.closeQuietly(in);
- }
-
- String charset = ContentStreamBase.getCharsetFromContentType(contentType);
- if (charset == null)
- charset = ContentStreamBase.DEFAULT_CHARSET;
- // Re-open the stream and this time use a reader
- stream = new ContentStreamBase.URLStream( url );
- assertTrue( IOUtils.contentEquals( new StringReader(new String(content, charset)), stream.getReader() ) );
+ ContentStreamBase stream = new ContentStreamBase.URLStream( new URL(file.toURI().toASCIIString()) );
+ assertTrue( IOUtils.contentEquals( new FileInputStream( file ), stream.getStream() ) );
+ assertEquals( file.length(), stream.getSize().intValue() );
+ assertTrue( IOUtils.contentEquals( new InputStreamReader(new FileInputStream(file), "UTF-8"), stream.getReader() ) );
+ assertEquals( file.length(), stream.getSize().intValue() );
}
}
Modified: lucene/dev/branches/lucene3312/solr/test-framework/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3312/solr/test-framework/build.xml?rev=1374718&r1=1374717&r2=1374718&view=diff
==============================================================================
--- lucene/dev/branches/lucene3312/solr/test-framework/build.xml (original)
+++ lucene/dev/branches/lucene3312/solr/test-framework/build.xml Sun Aug 19 09:35:25 2012
@@ -20,10 +20,14 @@
<import file="../common-build.xml"/>
- <path id="javadoc.classpath">
- <path refid="test.classpath"/>
- <pathelement location="${common.dir}/build/test-framework/classes/java"/>
- <pathelement location="${build.dir}/classes/java"/>
+ <path id="solr.test.framework.lucene.libs">
+ <pathelement location="${test-framework.jar}"/>
+ </path>
+
+ <path id="classpath">
+ <fileset dir="lib" excludes="${common.classpath.excludes}"/>
+ <path refid="solr.test.framework.lucene.libs" />
+ <path refid="solr.base.classpath"/>
</path>
<!-- Redefine Lucene test-framework compilation here to avoid circular dependency on compile-core -->
@@ -39,19 +43,26 @@
</compile>
</target>
- <!-- Override common-solr.javadocs to include JUnit,test-framework links -->
+ <!-- Override common-solr.javadocs to include JUnit links -->
<!-- and to copy the built javadocs to ${dest}/docs/api/test-framework -->
<target name="javadocs"
- depends="compile-core,lucene-javadocs,javadocs-test-framework,define-lucene-javadoc-url">
+ depends="compile-core,jar-test-framework,lucene-javadocs,javadocs-test-framework,define-lucene-javadoc-url">
<sequential>
<mkdir dir="${javadoc.dir}"/>
+ <!-- NOTE: explicitly not using solr-invoke-javadoc, or attempting to
+ link to lucene-test-framework because if we did javadoc would
attempt to link class refs in org.apache.lucene, causing
+ broken links. (either broken links to things like "Directory" if
+ lucene-test-framework was first, or broken links to things like
+ LuceneTestCase if lucene-core was first)
+ -->
<invoke-javadoc destdir="${javadoc.dir}"
- title="${Name}" overview="${src.dir}/overview.html">
- <sources>
- <link offline="true" href="${javadoc.link.junit}"
- packagelistLoc="${javadoc.packagelist.dir}/junit"/>
- <packageset dir="${src.dir}"/>
- </sources>
+ title="${Name} ${version} Test Framework API">
+ <sources>
+ <link offline="true" href="${javadoc.link.junit}"
+ packagelistLoc="${javadoc.packagelist.dir}/junit"/>
+ <packageset dir="${src.dir}"/>
+ </sources>
</invoke-javadoc>
<solr-jarify basedir="${javadoc.dir}" destfile="${build.dir}/${final.name}-javadoc.jar"/>
<mkdir dir="${dest}/docs/api/test-framework"/>
@@ -60,5 +71,28 @@
</copy>
</sequential>
</target>
+
+ <target name="module-jars-to-solr"
+ depends="jar-test-framework">
+ <mkdir dir="${build.dir}/lucene-libs"/>
+ <copy todir="${build.dir}/lucene-libs" preservelastmodified="true" flatten="true" failonerror="true" overwrite="true">
+ <path refid="solr.test.framework.lucene.libs" />
+ </copy>
+ </target>
+
+ <target name="dist" depends="module-jars-to-solr, common-solr.dist">
+ <!-- we're not a contrib, our lucene-libs and lib go in a special place -->
+ <mkdir dir="${dist}/test-framework" />
+ <copy todir="${dist}/test-framework">
+ <fileset dir="${build.dir}">
+ <include name="lucene-libs/*.jar" />
+ </fileset>
+ <fileset dir=".">
+ <include name="lib/*" />
+ <include name="README.txt" />
+ </fileset>
+ </copy>
+ </target>
+
</project>
Modified: lucene/dev/branches/lucene3312/solr/test-framework/ivy.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3312/solr/test-framework/ivy.xml?rev=1374718&r1=1374717&r2=1374718&view=diff
==============================================================================
--- lucene/dev/branches/lucene3312/solr/test-framework/ivy.xml (original)
+++ lucene/dev/branches/lucene3312/solr/test-framework/ivy.xml Sun Aug 19 09:35:25 2012
@@ -17,5 +17,25 @@
under the License.
-->
<ivy-module version="2.0">
- <info organisation="org.apache.solr" module="core-test-framework"/>
+ <info organisation="org.apache.solr" module="solr-test-framework"/>
+
+ <configurations>
+ <conf name="default" />
+ <!--
+ JUnit4 ANT task only, no ANT.
+ This is used from build scripts for taskdefs.
+ -->
+ <conf name="junit4-stdalone" />
+ </configurations>
+
+ <dependencies defaultconf="default">
+ <dependency org="org.apache.ant" name="ant" rev="1.8.2" transitive="false" />
+ <dependency org="org.apache.ant" name="ant-junit" rev="1.8.2" transitive="false" />
+
+ <dependency org="junit" name="junit" rev="4.10" transitive="false" conf="default->*;junit4-stdalone->*" />
+ <dependency org="com.carrotsearch.randomizedtesting" name="junit4-ant" rev="2.0.0.rc5" transitive="false" conf="default->*;junit4-stdalone->*" />
+ <dependency org="com.carrotsearch.randomizedtesting" name="randomizedtesting-runner" rev="2.0.0.rc5" transitive="false" conf="default->*;junit4-stdalone->*" />
+
+ <exclude org="*" ext="*" matcher="regexp" type="${ivy.exclude.types}"/>
+ </dependencies>
</ivy-module>
Modified: lucene/dev/branches/lucene3312/solr/test-framework/src/java/org/apache/solr/core/MockDirectoryFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3312/solr/test-framework/src/java/org/apache/solr/core/MockDirectoryFactory.java?rev=1374718&r1=1374717&r2=1374718&view=diff
==============================================================================
--- lucene/dev/branches/lucene3312/solr/test-framework/src/java/org/apache/solr/core/MockDirectoryFactory.java (original)
+++ lucene/dev/branches/lucene3312/solr/test-framework/src/java/org/apache/solr/core/MockDirectoryFactory.java Sun Aug 19 09:35:25 2012
@@ -32,10 +32,12 @@ public class MockDirectoryFactory extend
@Override
protected Directory create(String path) throws IOException {
Directory dir = LuceneTestCase.newDirectory();
- // Somehow removing unref'd files in Solr tests causes
- // problems... there's some interaction w/
- // CachingDirectoryFactory. Once we track down where Solr
- // isn't closing an IW, we can re-enable this:
+ // we can't currently do this check because of how
+ // Solr has to reboot a new Directory sometimes when replicating
+ // or rolling back - the old directory is closed and the following
+ // test assumes it can open an IndexWriter when that happens - we
+ // have a new Directory for the same dir and still an open IW at
+ // this point
if (dir instanceof MockDirectoryWrapper) {
((MockDirectoryWrapper)dir).setAssertNoUnrefencedFilesOnClose(false);
}
Modified: lucene/dev/branches/lucene3312/solr/test-framework/src/java/org/apache/solr/core/MockFSDirectoryFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3312/solr/test-framework/src/java/org/apache/solr/core/MockFSDirectoryFactory.java?rev=1374718&r1=1374717&r2=1374718&view=diff
==============================================================================
--- lucene/dev/branches/lucene3312/solr/test-framework/src/java/org/apache/solr/core/MockFSDirectoryFactory.java (original)
+++ lucene/dev/branches/lucene3312/solr/test-framework/src/java/org/apache/solr/core/MockFSDirectoryFactory.java Sun Aug 19 09:35:25 2012
@@ -32,10 +32,12 @@ public class MockFSDirectoryFactory exte
@Override
public Directory create(String path) throws IOException {
Directory dir = LuceneTestCase.newFSDirectory(new File(path));
- // Somehow removing unref'd files in Solr tests causes
- // problems... there's some interaction w/
- // CachingDirectoryFactory. Once we track down where Solr
- // isn't closing an IW, we can re-enable this:
+ // we can't currently do this check because of how
+ // Solr has to reboot a new Directory sometimes when replicating
+ // or rolling back - the old directory is closed and the following
+ // test assumes it can open an IndexWriter when that happens - we
+ // have a new Directory for the same dir and still an open IW at
+ // this point
if (dir instanceof MockDirectoryWrapper) {
((MockDirectoryWrapper)dir).setAssertNoUnrefencedFilesOnClose(false);
}