Posted to solr-user@lucene.apache.org by danomano <ds...@earthlink.net> on 2011/03/08 23:17:46 UTC

Solr Hanging all of sudden with update/csv

Hi folks, I've been using Solr for about 3 months.

Our Solr install is a single node, and we have been injecting logging data
into the Solr server every couple of minutes, with each update taking a few
minutes.

Everything was working fine until this morning, at which point it appeared that
all updates were hung.

Restarting the Solr server did not help, as all updaters immediately 'hung'
again.

Poking around in the thread dumps and strace output, I do in fact see stuff
happening.

The index itself is about 270 GB (we are hoping to support up to 500 GB-1 TB),
and we have supplied the system with ~3 TB of disk space.

Any tips on what could be happening?
Notes: we have never run an optimize yet.
       We have never deleted from the system yet.


The merge thread appears to be the one that never returns:
"Lucene Merge Thread #0" - Thread t@41
   java.lang.Thread.State: RUNNABLE
	at sun.nio.ch.FileDispatcher.pread0(Native Method)
	at sun.nio.ch.FileDispatcher.pread(FileDispatcher.java:31)
	at sun.nio.ch.IOUtil.readIntoNativeBuffer(IOUtil.java:234)
	at sun.nio.ch.IOUtil.read(IOUtil.java:210)
	at sun.nio.ch.FileChannelImpl.read(FileChannelImpl.java:622)
	at
org.apache.lucene.store.NIOFSDirectory$NIOFSIndexInput.readInternal(NIOFSDirectory.java:161)
	at
org.apache.lucene.store.BufferedIndexInput.readBytes(BufferedIndexInput.java:139)
	at
org.apache.lucene.store.BufferedIndexInput.readBytes(BufferedIndexInput.java:94)
	at org.apache.lucene.store.DataOutput.copyBytes(DataOutput.java:176)
	at
org.apache.lucene.index.FieldsWriter.addRawDocuments(FieldsWriter.java:209)
	at
org.apache.lucene.index.SegmentMerger.copyFieldsNoDeletions(SegmentMerger.java:424)
	at
org.apache.lucene.index.SegmentMerger.mergeFields(SegmentMerger.java:332)
	at org.apache.lucene.index.SegmentMerger.merge(SegmentMerger.java:153)
	at org.apache.lucene.index.IndexWriter.mergeMiddle(IndexWriter.java:4053)
	at org.apache.lucene.index.IndexWriter.merge(IndexWriter.java:3645)
	at
org.apache.lucene.index.ConcurrentMergeScheduler.doMerge(ConcurrentMergeScheduler.java:339)
	at
org.apache.lucene.index.ConcurrentMergeScheduler$MergeThread.run(ConcurrentMergeScheduler.java:407)


Some strace output:
23178 pread(172,
"\270\316\276\2\245\371\274\2\271\316\276\2\272\316\276\2\273\316\276\2\274\316\276\2\275\316\276\2\276\316\276\2"...,
4096, 98004192) = 4096 <0.000009>
23178 pread(172,
"\245\371\274\2\271\316\276\2\272\316\276\2\273\316\276\2\274\316\276\2\275\316\276\2\276\316\276\2\277\316\276\2"...,
4096, 98004196) = 4096 <0.000009>
23178 pread(172,
"\271\316\276\2\272\316\276\2\273\316\276\2\274\316\276\2\275\316\276\2\276\316\276\2\277\316\276\2\300\316\276\2"...,
4096, 98004200) = 4096 <0.000008>
23178 pread(172,
"\272\316\276\2\273\316\276\2\274\316\276\2\275\316\276\2\276\316\276\2\277\316\276\2\300\316\276\2\301\316\276\2"...,
4096, 98004204) = 4096 <0.000008>
23178 pread(172,
"\273\316\276\2\274\316\276\2\275\316\276\2\276\316\276\2\277\316\276\2\300\316\276\2\301\316\276\2\302\316\276\2"...,
4096, 98004208) = 4096 <0.000008>
23178 pread(172,
"\274\316\276\2\275\316\276\2\276\316\276\2\277\316\276\2\300\316\276\2\301\316\276\2\302\316\276\2\367\343\274\2"...,
4096, 98004212) = 4096 <0.000009>
23178 pread(172,
"\275\316\276\2\276\316\276\2\277\316\276\2\300\316\276\2\301\316\276\2\302\316\276\2\367\343\274\2\246\371\274\2"...,
4096, 98004216) = 4096 <0.000008>
23178 pread(172,
"\276\316\276\2\277\316\276\2\300\316\276\2\301\316\276\2\302\316\276\2\367\343\274\2\246\371\274\2\303\316\276\2"...,
4096, 98004220) = 4096 <0.000009>
23178 pread(172,
"\277\316\276\2\300\316\276\2\301\316\276\2\302\316\276\2\367\343\274\2\246\371\274\2\303\316\276\2\304\316\276\2"...,
4096, 98004224) = 4096 <0.000013>
22688 <... futex resumed> )             = -1 ETIMEDOUT (Connection timed
out) <0.051276>
23178 pread(172,
"\300\316\276\2\301\316\276\2\302\316\276\2\367\343\274\2\246\371\274\2\303\316\276\2\304\316\276\2\305\316\276\2"...,
4096, 98004228) = 4096 <0.000010>
22688 futex(0x464a9f28, FUTEX_WAKE_PRIVATE, 1 
23178 pread(172,
"\301\316\276\2\302\316\276\2\367\343\274\2\246\371\274\2\303\316\276\2\304\316\276\2\305\316\276\2\306\316\276\2"...,
4096, 98004232) = 4096 <0.000010>
22688 <... futex resumed> )             = 0 <0.000051>
23178 pread(172,
"\302\316\276\2\367\343\274\2\246\371\274\2\303\316\276\2\304\316\276\2\305\316\276\2\306\316\276\2\307\316\276\2"...,
4096, 98004236) = 4096 <0.000010>
22688 clock_gettime(CLOCK_MONOTONIC,  
23178 pread(172,
"\367\343\274\2\246\371\274\2\303\316\276\2\304\316\276\2\305\316\276\2\306\316\276\2\307\316\276\2\310\316\276\2"...,
4096, 98004240) = 4096 <0.000010>
22688 <... clock_gettime resumed> {1900472, 454038316}) = 0 <0.000054>
23178 pread(172,
"\246\371\274\2\303\316\276\2\304\316\276\2\305\316\276\2\306\316\276\2\307\316\276\2\310\316\276\2\311\316\276\2"...,
4096, 98004244) = 4096 <0.000011>
22688 clock_gettime(CLOCK_MONOTONIC,  
23178 pread(172,
"\303\316\276\2\304\316\276\2\305\316\276\2\306\316\276\2\307\316\276\2\310\316\276\2\311\316\276\2\312\316\276\2"...,
4096, 98004248) = 4096 <0.000010>
22688 <... clock_gettime resumed> {1900472, 454169316}) = 0 <0.000051>
23178 pread(172,
"\304\316\276\2\305\316\276\2\306\316\276\2\307\316\276\2\310\316\276\2\311\316\276\2\312\316\276\2\313\316\276\2"...,
4096, 98004252) = 4096 <0.000010>
22688 clock_gettime(CLOCK_MONOTONIC,  
23178 pread(172,
"\305\316\276\2\306\316\276\2\307\316\276\2\310\316\276\2\311\316\276\2\312\316\276\2\313\316\276\2\314\316\276\2"...,
4096, 98004256) = 4096 <0.000011>
22688 <... clock_gettime resumed> {1900472, 454290316}) = 0 <0.000049>
23178 pread(172,
"\306\316\276\2\307\316\276\2\310\316\276\2\311\316\276\2\312\316\276\2\313\316\276\2\314\316\276\2\247\371\274\2"...,
4096, 98004260) = 4096 <0.000010>
22688 clock_gettime(CLOCK_REALTIME,  
23178 pread(172,
"\307\316\276\2\310\316\276\2\311\316\276\2\312\316\276\2\313\316\276\2\314\316\276\2\247\371\274\2\315\316\276\2"...,
4096, 98004264) = 4096 <0.000010>
22688 <... clock_gettime resumed> {1299621913, 884373000}) = 0 <0.000050>
23178 pread(172,
"\310\316\276\2\311\316\276\2\312\316\276\2\313\316\276\2\314\316\276\2\247\371\274\2\315\316\276\2\316\316\276\2"...,
4096, 98004268) = 4096 <0.000010>
22688 futex(0x2aac7406ae34, FUTEX_WAIT_PRIVATE, 1, {0, 49938000} 
23178 pread(172,
"\311\316\276\2\312\316\276\2\313\316\276\2\314\316\276\2\247\371\274\2\315\316\276\2\316\316\276\2\317\316\276\2"...,
4096, 98004272) = 4096 <0.000008>
23178 pread(172,
"\312\316\276\2\313\316\276\2\314\316\276\2\247\371\274\2\315\316\276\2\316\316\276\2\317\316\276\2\320\316\276\2"...,
4096, 98004276) = 4096 <0.000009>
23178 pread(172,
"\313\316\276\2\314\316\276\2\247\371\274\2\315\316\276\2\316\316\276\2\317\316\276\2\320\316\276\2\321\316\276\2"...,
4096, 98004280) = 4096 <0.000008>
23178 pread(172,
"\314\316\276\2\247\371\274\2\315\316\276\2\316\316\276\2\317\316\276\2\320\316\276\2\321\316\276\2\322\316\276\2"...,
4096, 98004284) = 4096 <0.000009>
23178 pread(172,
"\247\371\274\2\315\316\276\2\316\316\276\2\317\316\276\2\320\316\276\2\321\316\276\2\322\316\276\2\323\316\276\2"...,
4096, 98004288) = 40



Thanks
Dan






Re: Solr Hanging all of sudden with update/csv

Posted by Jason Rutherglen <ja...@gmail.com>.
You will need to cap the maximum segment size using
LogByteSizeMergePolicy.setMaxMergeMB.  Then you will only have
segments that are of an optimal size, and Lucene will not try to
create gigantic segments.  I think, though, that on the query side you will
run out of heap space due to the terms index size.  What version are
you using?
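
As a rough sketch of what I mean (assuming a Solr version that supports
merge-policy property injection in solrconfig.xml; the element syntax varies
by version, and the 2048 MB cap is purely illustrative):

    <mergePolicy class="org.apache.lucene.index.LogByteSizeMergePolicy">
      <!-- cap merged segments at roughly 2 GB; pick a value your heap can handle -->
      <double name="maxMergeMB">2048</double>
    </mergePolicy>

If you are driving Lucene directly instead, the equivalent is calling
setMaxMergeMB(...) on the LogByteSizeMergePolicy you hand to the IndexWriter.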

On Wed, Mar 9, 2011 at 10:17 AM, danomano <ds...@earthlink.net> wrote:
> After about 4-5 hours the merge completed (it had run out of heap); as you
> suggested, it was having memory issues.
>
> Read queries during the merge were working just fine (they were taking
> longer than normal, ~30-60 seconds).
>
> I think I need to do more reading to understand the merge/optimization
> process.
>
> I am beginning to think what I need to do is have lots of segments (i.e.
> frequent merges of smaller segments); wouldn't that speed up the merging
> process when it actually runs?
>
> A couple of things I'm trying to wrap my head around:
>
> Increasing the number of segments will improve indexing speed on the whole.
> The question I have is: when it actually needs to perform a merge, will
> having more segments make the merge faster or slower?  Having a 4-hour
> merge (i.e. a single indexing request) is not really acceptable (unless I
> can control when that merge happens).
>
> We are using our Solr server differently than most: frequent inserts (in
> batches), with few reads.
>
> I would say having a 'long' query time is acceptable (say ~60 seconds).
>
>
>
>
>
>

Re: Solr Hanging all of sudden with update/csv

Posted by danomano <ds...@earthlink.net>.
Sweet, those links were very useful :)

and should most definitely help :)

A couple of overriding concerns I have:
1) If I were to simply update the config to use a different mergeFactor and
restart the Solr server, would it then adjust the segments accordingly, or
would I need to start from scratch (i.e. re-index all the data)?  (See the
sketch below.)

2) Likewise, if I choose a new merge policy (such as the ZoieMergePolicy), I
take it I would need to rebuild the entire index from scratch?
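
(To be concrete, by 'update the config' in 1) I mean something like this
sketch of the index settings in solrconfig.xml; the section name and values
are only illustrative, and the only line I'd actually change is mergeFactor:)

    <indexDefaults>
      <mergeFactor>20</mergeFactor>        <!-- raised from the stock 10 -->
      <ramBufferSizeMB>32</ramBufferSizeMB>
    </indexDefaults>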

I suspect I'm going to have to bite the bullet and do it all from scratch
again :(
Is there perhaps a tool that can take one index and reprocess it by forwarding
all of its data into another Solr instance?






Re: Solr Hanging all of sudden with update/csv

Posted by Otis Gospodnetic <ot...@yahoo.com>.
Hi,

You'll benefit from watching this segment merging video:
  http://blog.mikemccandless.com/2011/02/visualizing-lucenes-segment-merges.html

And you'll appreciate the graph at the bottom:
  http://code.google.com/p/zoie/wiki/ZoieMergePolicy

Otis
----
Sematext :: http://sematext.com/ :: Solr - Lucene - Nutch
Lucene ecosystem search :: http://search-lucene.com/



----- Original Message ----
> From: danomano <ds...@earthlink.net>
> To: solr-user@lucene.apache.org
> Sent: Wed, March 9, 2011 1:17:08 PM
> Subject: Re: Solr Hanging all of sudden with update/csv
> 
> After about 4-5 hours the merge completed (it had run out of heap); as you
> suggested, it was having memory issues.
>
> Read queries during the merge were working just fine (they were taking
> longer than normal, ~30-60 seconds).
>
> I think I need to do more reading to understand the merge/optimization
> process.
>
> I am beginning to think what I need to do is have lots of segments (i.e.
> frequent merges of smaller segments); wouldn't that speed up the merging
> process when it actually runs?
>
> A couple of things I'm trying to wrap my head around:
>
> Increasing the number of segments will improve indexing speed on the whole.
> The question I have is: when it actually needs to perform a merge, will
> having more segments make the merge faster or slower?  Having a 4-hour
> merge (i.e. a single indexing request) is not really acceptable (unless I
> can control when that merge happens).
>
> We are using our Solr server differently than most: frequent inserts (in
> batches), with few reads.
>
> I would say having a 'long' query time is acceptable (say ~60 seconds).
> 
> 
> 
> 
> 
> 

Re: Solr Hanging all of sudden with update/csv

Posted by danomano <ds...@earthlink.net>.
After about 4-5 hours the merge completed (it had run out of heap); as you
suggested, it was having memory issues.

Read queries during the merge were working just fine (they were taking
longer than normal, ~30-60 seconds).

I think I need to do more reading to understand the merge/optimization
process.

I am beginning to think what I need to do is have lots of segments (i.e.
frequent merges of smaller segments); wouldn't that speed up the merging
process when it actually runs?

A couple of things I'm trying to wrap my head around:

Increasing the number of segments will improve indexing speed on the whole.
The question I have is: when it actually needs to perform a merge, will
having more segments make the merge faster or slower?  Having a 4-hour
merge (i.e. a single indexing request) is not really acceptable (unless I
can control when that merge happens).

We are using our Solr server differently than most: frequent inserts (in
batches), with few reads.

I would say having a 'long' query time is acceptable (say ~60 seconds).






Re: Solr Hanging all of sudden with update/csv

Posted by Jason Rutherglen <ja...@gmail.com>.
> The index itself is about 270 GB (we are hoping to support up to 500 GB-1 TB),
> and we have supplied the system with ~3 TB of disk space.

That's simply massive for a single node.  When the system tries to
merge the segments, the queries are probably not working?  And the
merges will take quite a while.  How long is it acceptable for a single
query to take?

On Tue, Mar 8, 2011 at 2:17 PM, danomano <ds...@earthlink.net> wrote:
> Hi folks, I've been using Solr for about 3 months.
>
> Our Solr install is a single node, and we have been injecting logging data
> into the Solr server every couple of minutes, with each update taking a few
> minutes.
>
> Everything was working fine until this morning, at which point it appeared
> that all updates were hung.
>
> Restarting the Solr server did not help, as all updaters immediately 'hung'
> again.
>
> Poking around in the thread dumps and strace output, I do in fact see stuff
> happening.
>
> The index itself is about 270 GB (we are hoping to support up to 500 GB-1 TB),
> and we have supplied the system with ~3 TB of disk space.
>
> Any tips on what could be happening?
> Notes: we have never run an optimize yet.
>        We have never deleted from the system yet.
>
>
> [The merge thread stack trace and strace output quoted here are the same as
> in the original post above; snipped for brevity.]
>
>
>
> Thanks
> Dan
>
>
>
>
>
>

Re: Solr Hanging all of sudden with update/csv

Posted by danomano <ds...@earthlink.net>.
Actually, this is definitely not a RAM issue.  I have VisualVM connected, and
the max RAM available to the JVM is ~7 GB, but the system is only using
~5.5 GB, with a max so far of 6.5 GB consumed.

I think (well, I'm guessing) the system hit a merge threshold, but I can't
tell for sure.  I have seen the index size grow rapidly today (much more than
normal; in the last 3 hours the index size has increased by about 50%).
From various posts I see that during an 'optimize' (which I have not
called), or perhaps during the merging of segments, it is normal for the disk
space requirements to temporarily increase by 2x to 3x.  As such, my only
assumption is that it must be conducting a merge.
Note: since I restarted the Solr server, I have had only 1 client thread
pushing data in (it has already transmitted its data, ~2 MB), and it has been
held up for about 4 hours now.  I believe it is stuck waiting for the merge
thread to complete.

Is there a better way to handle merging, or at least to predict when it will
occur?  (I'm essentially using the defaults: mergeFactor 10, ramBufferSizeMB
32.)
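
For reference, I believe the index settings in my solrconfig.xml are just the
stock defaults, roughly like this (a sketch; the exact layout depends on the
Solr version):

    <indexDefaults>
      <ramBufferSizeMB>32</ramBufferSizeMB>
      <mergeFactor>10</mergeFactor>
      <!-- mergePolicy and mergeScheduler are left at their defaults -->
    </indexDefaults>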

I'm totally new to Solr/Lucene/indexing in general, so I'm somewhat clueless
about all this.
It should be noted that we have millions of documents, all of which are
generally < 4 KB.





Re: Solr Hanging all of sudden with update/csv

Posted by Jonathan Rochkind <ro...@jhu.edu>.
My guess is that you're running out of RAM.  Actual Java profiling is
beyond me, but I have seen issues with updating that were solved by more RAM.

If you are updating every few minutes, and your new index takes more
than a few minutes to warm, you could be running into overlapping
warming-searcher issues.  There is some more info on what I mean by this in
this FAQ, although the FAQ isn't actually targeted at this case exactly:
http://wiki.apache.org/solr/FAQ#What_does_.22exceeded_limit_of_maxWarmingSearchers.3DX.22_mean.3F

Overlapping warming searchers can result in excessive RAM and/or CPU usage.

If you haven't given your JVM options to tune garbage collection, that can
also help, using the options for the concurrent collector.  But if your
fundamental problem is overlapping warming searchers, you probably need to
make that stop.
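
For example (a sketch only, assuming a Sun/Oracle JVM and the stock Jetty
start.jar; the heap size and flags are illustrative, not a recommendation):

    java -Xmx6g -XX:+UseConcMarkSweepGC -XX:+UseParNewGC -jar start.jar

And on the Solr side, keeping the warming-searcher cap low in solrconfig.xml
makes overlapping warms fail fast (with the error described in the FAQ above)
instead of stacking up searchers:

    <maxWarmingSearchers>2</maxWarmingSearchers>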

On 3/8/2011 5:17 PM, danomano wrote:
> Hi folks, I've been using Solr for about 3 months.
>
> Our Solr install is a single node, and we have been injecting logging data
> into the Solr server every couple of minutes, with each update taking a few
> minutes.
>
> Everything was working fine until this morning, at which point it appeared
> that all updates were hung.
>
> Restarting the Solr server did not help, as all updaters immediately 'hung'
> again.
>
> Poking around in the thread dumps and strace output, I do in fact see stuff
> happening.
>
> The index itself is about 270 GB (we are hoping to support up to 500 GB-1 TB),
> and we have supplied the system with ~3 TB of disk space.
>
> Any tips on what could be happening?
> Notes: we have never run an optimize yet.
>        We have never deleted from the system yet.
>
>
> [The merge thread stack trace and strace output quoted here are the same as
> in the original post above; snipped for brevity.]
>
>
>
> Thanks
> Dan
>
>
>
>
>
>