You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2011/06/24 17:35:12 UTC

svn commit: r1139357 - in /nutch/branches/branch-1.4: CHANGES.txt src/java/org/apache/nutch/indexer/solr/SolrClean.java src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java src/java/org/apache/nutch/indexer/solr/SolrIndexer.java

Author: markus
Date: Fri Jun 24 15:35:12 2011
New Revision: 1139357

URL: http://svn.apache.org/viewvc?rev=1139357&view=rev
Log:
NUTCH-1000 Add option not to commit to Solr

Modified:
    nutch/branches/branch-1.4/CHANGES.txt
    nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrClean.java
    nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java
    nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrIndexer.java

Modified: nutch/branches/branch-1.4/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.4/CHANGES.txt?rev=1139357&r1=1139356&r2=1139357&view=diff
==============================================================================
--- nutch/branches/branch-1.4/CHANGES.txt (original)
+++ nutch/branches/branch-1.4/CHANGES.txt Fri Jun 24 15:35:12 2011
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Release 1.4 - Current development
 
+* NUTCH-1000 Add option not to commit to Solr (markus)
+
 * NUTCH-1006 MetaEquiv with single quotes not accepted (markus)
 
 * NUTCH-1010 ContentLength not trimmed (markus)

Modified: nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrClean.java
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrClean.java?rev=1139357&r1=1139356&r2=1139357&view=diff
==============================================================================
--- nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrClean.java (original)
+++ nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrClean.java Fri Jun 24 15:35:12 2011
@@ -97,11 +97,13 @@ public class SolrClean implements Tool {
     private int totalDeleted = 0;
     private SolrServer solr;
     private UpdateRequest updateRequest = new UpdateRequest();
+    private boolean noCommit = false;
 
     @Override
     public void configure(JobConf job) {
       try {
         solr = new CommonsHttpSolrServer(job.get(SolrConstants.SERVER_URL));
+        noCommit = job.getBoolean("noCommit", false);
       } catch (MalformedURLException e) {
         throw new RuntimeException(e);
       }
@@ -116,7 +118,7 @@ public class SolrClean implements Tool {
           totalDeleted += numDeletes;
         }
 
-        if (totalDeleted > 0) {
+        if (totalDeleted > 0 && !noCommit) {
           solr.commit();
         }
 
@@ -149,7 +151,7 @@ public class SolrClean implements Tool {
     }
   }
 
-  public void delete(String crawldb, String solrUrl) throws IOException {
+  public void delete(String crawldb, String solrUrl, boolean noCommit) throws IOException {
     SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
     long start = System.currentTimeMillis();
     LOG.info("SolrClean: starting at " + sdf.format(start));
@@ -157,6 +159,7 @@ public class SolrClean implements Tool {
     JobConf job = new NutchJob(getConf());
 
     FileInputFormat.addInputPath(job, new Path(crawldb, CrawlDb.CURRENT_NAME));
+    job.setBoolean("noCommit", noCommit);
     job.set(SolrConstants.SERVER_URL, solrUrl);
     job.setInputFormat(SequenceFileInputFormat.class);
     job.setOutputFormat(NullOutputFormat.class);
@@ -172,12 +175,17 @@ public class SolrClean implements Tool {
   }
 
   public int run(String[] args) throws IOException {
-    if (args.length != 2) {
-      System.err.println("Usage: SolrClean <crawldb> <solrurl>");
+    if (args.length < 2) {
+      System.err.println("Usage: SolrClean <crawldb> <solrurl> [-noCommit]");
       return 1;
     }
 
-    delete(args[0], args[1]);
+    boolean noCommit = false;
+    if (args.length == 3 && args[2].equals("-noCommit")) {
+      noCommit = true;
+    }
+
+    delete(args[0], args[1], noCommit);
 
     return 0;
   }

Modified: nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java?rev=1139357&r1=1139356&r2=1139357&view=diff
==============================================================================
--- nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java (original)
+++ nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java Fri Jun 24 15:35:12 2011
@@ -282,6 +282,8 @@ Tool {
 
   private SolrServer solr;
 
+  private boolean noCommit = false;
+
   private int numDeletes = 0;
 
   private UpdateRequest updateRequest = new UpdateRequest();
@@ -297,6 +299,7 @@ Tool {
   public void configure(JobConf job) {
     try {
       solr = new CommonsHttpSolrServer(job.get(SolrConstants.SERVER_URL));
+      noCommit = job.getBoolean("noCommit", false);
     } catch (MalformedURLException e) {
       throw new RuntimeException(e);
     }
@@ -308,7 +311,10 @@ Tool {
       if (numDeletes > 0) {
         LOG.info("SolrDeleteDuplicates: deleting " + numDeletes + " duplicates");
         updateRequest.process(solr);
-        solr.commit();
+
+        if (!noCommit) {
+          solr.commit();
+        }
       }
     } catch (SolrServerException e) {
       throw new IOException(e);
@@ -343,7 +349,7 @@ Tool {
     }
   }
 
-  public void dedup(String solrUrl) throws IOException {
+  public void dedup(String solrUrl, boolean noCommit) throws IOException {
     SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
     long start = System.currentTimeMillis();
     LOG.info("SolrDeleteDuplicates: starting at " + sdf.format(start));
@@ -352,6 +358,7 @@ Tool {
     JobConf job = new NutchJob(getConf());
 
     job.set(SolrConstants.SERVER_URL, solrUrl);
+    job.setBoolean("noCommit", noCommit);
     job.setInputFormat(SolrInputFormat.class);
     job.setOutputFormat(NullOutputFormat.class);
     job.setMapOutputKeyClass(Text.class);
@@ -366,12 +373,17 @@ Tool {
   }
 
   public int run(String[] args) throws IOException {
-    if (args.length != 1) {
-      System.err.println("Usage: SolrDeleteDuplicates <solr url>");
+    if (args.length < 1) {
+      System.err.println("Usage: SolrDeleteDuplicates <solr url> [-noCommit]");
       return 1;
     }
 
-    dedup(args[0]);
+    boolean noCommit = false;
+    if (args.length == 2 && args[1].equals("-noCommit")) {
+      noCommit = true;
+    }
+
+    dedup(args[0], noCommit);
     return 0;
   }
 

Modified: nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrIndexer.java
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrIndexer.java?rev=1139357&r1=1139356&r2=1139357&view=diff
==============================================================================
--- nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrIndexer.java (original)
+++ nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrIndexer.java Fri Jun 24 15:35:12 2011
@@ -57,7 +57,7 @@ public class SolrIndexer extends Configu
   }
 
   public void indexSolr(String solrUrl, Path crawlDb, Path linkDb,
-      List<Path> segments) throws IOException {
+      List<Path> segments, boolean noCommit) throws IOException {
     SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
     long start = System.currentTimeMillis();
     LOG.info("SolrIndexer: starting at " + sdf.format(start));
@@ -68,7 +68,6 @@ public class SolrIndexer extends Configu
     IndexerMapReduce.initMRJob(crawlDb, linkDb, segments, job);
 
     job.set(SolrConstants.SERVER_URL, solrUrl);
-
     NutchIndexWriterFactory.addClassToConf(job, SolrWriter.class);
 
     job.setReduceSpeculativeExecution(false);
@@ -81,7 +80,10 @@ public class SolrIndexer extends Configu
       JobClient.runJob(job);
       // do the commits once and for all the reducers in one go
       SolrServer solr =  new CommonsHttpSolrServer(solrUrl);
-      solr.commit();
+
+      if (!noCommit) {
+        solr.commit();
+      }
       long end = System.currentTimeMillis();
       LOG.info("SolrIndexer: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
     }
@@ -94,7 +96,7 @@ public class SolrIndexer extends Configu
 
   public int run(String[] args) throws Exception {
     if (args.length < 4) {
-      System.err.println("Usage: SolrIndexer <solr url> <crawldb> <linkdb> (<segment> ... | -dir <segments>)");
+      System.err.println("Usage: SolrIndexer <solr url> <crawldb> <linkdb> (<segment> ... | -dir <segments>) [-noCommit]");
       return -1;
     }
 
@@ -102,6 +104,9 @@ public class SolrIndexer extends Configu
     final Path linkDb = new Path(args[2]);
 
     final List<Path> segments = new ArrayList<Path>();
+
+    boolean noCommit = false;
+
     for (int i = 3; i < args.length; i++) {
       if (args[i].equals("-dir")) {
         Path dir = new Path(args[++i]);
@@ -112,13 +117,15 @@ public class SolrIndexer extends Configu
         for (Path p : files) {
           segments.add(p);
         }
+      } else if (args[i].equals("-noCommit")) {
+        noCommit = true;
       } else {
         segments.add(new Path(args[i]));
       }
     }
 
     try {
-      indexSolr(args[0], crawlDb, linkDb, segments);
+      indexSolr(args[0], crawlDb, linkDb, segments, noCommit);
       return 0;
     } catch (final Exception e) {
       LOG.fatal("SolrIndexer: " + StringUtils.stringifyException(e));