You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2011/06/24 17:35:12 UTC
svn commit: r1139357 - in /nutch/branches/branch-1.4: CHANGES.txt
src/java/org/apache/nutch/indexer/solr/SolrClean.java
src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java
src/java/org/apache/nutch/indexer/solr/SolrIndexer.java
Author: markus
Date: Fri Jun 24 15:35:12 2011
New Revision: 1139357
URL: http://svn.apache.org/viewvc?rev=1139357&view=rev
Log:
NUTCH-1000 Add option not to commit to Solr
Modified:
nutch/branches/branch-1.4/CHANGES.txt
nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrClean.java
nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java
nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrIndexer.java
Modified: nutch/branches/branch-1.4/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.4/CHANGES.txt?rev=1139357&r1=1139356&r2=1139357&view=diff
==============================================================================
--- nutch/branches/branch-1.4/CHANGES.txt (original)
+++ nutch/branches/branch-1.4/CHANGES.txt Fri Jun 24 15:35:12 2011
@@ -2,6 +2,8 @@ Nutch Change Log
Release 1.4 - Current development
+* NUTCH-1000 Add option not to commit to Solr (markus)
+
* NUTCH-1006 MetaEquiv with single quotes not accepted (markus)
* NUTCH-1010 ContentLength not trimmed (markus)
Modified: nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrClean.java
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrClean.java?rev=1139357&r1=1139356&r2=1139357&view=diff
==============================================================================
--- nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrClean.java (original)
+++ nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrClean.java Fri Jun 24 15:35:12 2011
@@ -97,11 +97,13 @@ public class SolrClean implements Tool {
private int totalDeleted = 0;
private SolrServer solr;
private UpdateRequest updateRequest = new UpdateRequest();
+ private boolean noCommit = false;
@Override
public void configure(JobConf job) {
try {
solr = new CommonsHttpSolrServer(job.get(SolrConstants.SERVER_URL));
+ noCommit = job.getBoolean("noCommit", false);
} catch (MalformedURLException e) {
throw new RuntimeException(e);
}
@@ -116,7 +118,7 @@ public class SolrClean implements Tool {
totalDeleted += numDeletes;
}
- if (totalDeleted > 0) {
+ if (totalDeleted > 0 && !noCommit) {
solr.commit();
}
@@ -149,7 +151,7 @@ public class SolrClean implements Tool {
}
}
- public void delete(String crawldb, String solrUrl) throws IOException {
+ public void delete(String crawldb, String solrUrl, boolean noCommit) throws IOException {
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
long start = System.currentTimeMillis();
LOG.info("SolrClean: starting at " + sdf.format(start));
@@ -157,6 +159,7 @@ public class SolrClean implements Tool {
JobConf job = new NutchJob(getConf());
FileInputFormat.addInputPath(job, new Path(crawldb, CrawlDb.CURRENT_NAME));
+ job.setBoolean("noCommit", noCommit);
job.set(SolrConstants.SERVER_URL, solrUrl);
job.setInputFormat(SequenceFileInputFormat.class);
job.setOutputFormat(NullOutputFormat.class);
@@ -172,12 +175,17 @@ public class SolrClean implements Tool {
}
public int run(String[] args) throws IOException {
- if (args.length != 2) {
- System.err.println("Usage: SolrClean <crawldb> <solrurl>");
+ if (args.length < 2) {
+ System.err.println("Usage: SolrClean <crawldb> <solrurl> [-noCommit]");
return 1;
}
- delete(args[0], args[1]);
+ boolean noCommit = false;
+ if (args.length == 3 && args[2].equals("-noCommit")) {
+ noCommit = true;
+ }
+
+ delete(args[0], args[1], noCommit);
return 0;
}
Modified: nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java?rev=1139357&r1=1139356&r2=1139357&view=diff
==============================================================================
--- nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java (original)
+++ nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java Fri Jun 24 15:35:12 2011
@@ -282,6 +282,8 @@ Tool {
private SolrServer solr;
+ private boolean noCommit = false;
+
private int numDeletes = 0;
private UpdateRequest updateRequest = new UpdateRequest();
@@ -297,6 +299,7 @@ Tool {
public void configure(JobConf job) {
try {
solr = new CommonsHttpSolrServer(job.get(SolrConstants.SERVER_URL));
+ noCommit = job.getBoolean("noCommit", false);
} catch (MalformedURLException e) {
throw new RuntimeException(e);
}
@@ -308,7 +311,10 @@ Tool {
if (numDeletes > 0) {
LOG.info("SolrDeleteDuplicates: deleting " + numDeletes + " duplicates");
updateRequest.process(solr);
- solr.commit();
+
+ if (!noCommit) {
+ solr.commit();
+ }
}
} catch (SolrServerException e) {
throw new IOException(e);
@@ -343,7 +349,7 @@ Tool {
}
}
- public void dedup(String solrUrl) throws IOException {
+ public void dedup(String solrUrl, boolean noCommit) throws IOException {
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
long start = System.currentTimeMillis();
LOG.info("SolrDeleteDuplicates: starting at " + sdf.format(start));
@@ -352,6 +358,7 @@ Tool {
JobConf job = new NutchJob(getConf());
job.set(SolrConstants.SERVER_URL, solrUrl);
+ job.setBoolean("noCommit", noCommit);
job.setInputFormat(SolrInputFormat.class);
job.setOutputFormat(NullOutputFormat.class);
job.setMapOutputKeyClass(Text.class);
@@ -366,12 +373,17 @@ Tool {
}
public int run(String[] args) throws IOException {
- if (args.length != 1) {
- System.err.println("Usage: SolrDeleteDuplicates <solr url>");
+ if (args.length < 1) {
+ System.err.println("Usage: SolrDeleteDuplicates <solr url> [-noCommit]");
return 1;
}
- dedup(args[0]);
+ boolean noCommit = false;
+ if (args.length == 2 && args[1].equals("-noCommit")) {
+ noCommit = true;
+ }
+
+ dedup(args[0], noCommit);
return 0;
}
Modified: nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrIndexer.java
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrIndexer.java?rev=1139357&r1=1139356&r2=1139357&view=diff
==============================================================================
--- nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrIndexer.java (original)
+++ nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrIndexer.java Fri Jun 24 15:35:12 2011
@@ -57,7 +57,7 @@ public class SolrIndexer extends Configu
}
public void indexSolr(String solrUrl, Path crawlDb, Path linkDb,
- List<Path> segments) throws IOException {
+ List<Path> segments, boolean noCommit) throws IOException {
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
long start = System.currentTimeMillis();
LOG.info("SolrIndexer: starting at " + sdf.format(start));
@@ -68,7 +68,6 @@ public class SolrIndexer extends Configu
IndexerMapReduce.initMRJob(crawlDb, linkDb, segments, job);
job.set(SolrConstants.SERVER_URL, solrUrl);
-
NutchIndexWriterFactory.addClassToConf(job, SolrWriter.class);
job.setReduceSpeculativeExecution(false);
@@ -81,7 +80,10 @@ public class SolrIndexer extends Configu
JobClient.runJob(job);
// do the commits once and for all the reducers in one go
SolrServer solr = new CommonsHttpSolrServer(solrUrl);
- solr.commit();
+
+ if (!noCommit) {
+ solr.commit();
+ }
long end = System.currentTimeMillis();
LOG.info("SolrIndexer: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
@@ -94,7 +96,7 @@ public class SolrIndexer extends Configu
public int run(String[] args) throws Exception {
if (args.length < 4) {
- System.err.println("Usage: SolrIndexer <solr url> <crawldb> <linkdb> (<segment> ... | -dir <segments>)");
+ System.err.println("Usage: SolrIndexer <solr url> <crawldb> <linkdb> (<segment> ... | -dir <segments>) [-noCommit]");
return -1;
}
@@ -102,6 +104,9 @@ public class SolrIndexer extends Configu
final Path linkDb = new Path(args[2]);
final List<Path> segments = new ArrayList<Path>();
+
+ boolean noCommit = false;
+
for (int i = 3; i < args.length; i++) {
if (args[i].equals("-dir")) {
Path dir = new Path(args[++i]);
@@ -112,13 +117,15 @@ public class SolrIndexer extends Configu
for (Path p : files) {
segments.add(p);
}
+ } else if (args[i].equals("-noCommit")) {
+ noCommit = true;
} else {
segments.add(new Path(args[i]));
}
}
try {
- indexSolr(args[0], crawlDb, linkDb, segments);
+ indexSolr(args[0], crawlDb, linkDb, segments, noCommit);
return 0;
} catch (final Exception e) {
LOG.fatal("SolrIndexer: " + StringUtils.stringifyException(e));