You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-user@lucene.apache.org by Paul Taylor <pa...@fastmail.fm> on 2013/01/22 14:56:53 UTC
Is there a problem with my Analyzer subclass ?
I've been investigating potential memory leaks in my Lucene-based
application that runs on Jetty. I did a memory dump with jmap, and one
thing I've noticed is that, for any subclass of Analyzer that I have
created, there are a lot of instances of the $SavedStreams inner class.
So, for example, I can have just five instances of the TitleAnalyzer
analyser, but 417 instances of TitleAnalyzer$SavedStreams. These
$SavedStreams are taking a lot of memory, and my profiler (YourKit)
labels all but two of them (415 instances) as strongly reachable, i.e. they
have strong references, so I don't think they can be garbage collected,
which suggests a memory leak.
But I can't see where it is.
package org.musicbrainz.search.analysis;
import com.ibm.icu.text.Transliterator;
import org.apache.lucene.analysis.*;
import org.musicbrainz.search.LuceneVersion;
import java.io.IOException;
import java.io.Reader;
import java.util.regex.Pattern;
/**
* Should be used for analysing titles such as track title, release title
* or recording title, because it contains special processing for titles
* that isn't required for other text fields such as artist name.
*
* Filters MusicbrainzTokenizer with ICUTransformFilter,
* MusicbrainzTokenizerFilter, AccentFilter, LowercaseFilter and
* MusicbrainzWordDelimiterFilter, and uses no stop words.
*/
public class TitleAnalyzer extends Analyzer {
private NormalizeCharMap charConvertMap;
//We convert to the wrong form No.1 rather than the correct
form No. 1 because this keeps it as single token
//when tokenized so doesn't incorrectly match additional single
numbers in the text.
private Pattern no1Pattern = Pattern.compile("(no\\.) (\\d+)",
Pattern.CASE_INSENSITIVE);
private String no1PatternReplacement = "$1$2";
private void setCharConvertMap() {
charConvertMap = new NormalizeCharMap();
AmpersandToAndMappingHelper.addToMap(charConvertMap);
CharEquivToCharHelper.addToMap(charConvertMap);
HebrewCharMappingHelper.addToMap(charConvertMap);
}
public TitleAnalyzer() {
setCharConvertMap();
}
public final TokenStream tokenStream(String fieldName, Reader
reader) {
CharFilter mappingCharFilter = new
MappingCharFilter(charConvertMap, reader);
CharFilter no1CharFilter = new
PatternReplaceCharFilter(no1Pattern, no1PatternReplacement,
mappingCharFilter);
MusicbrainzTokenizer tokenStream = new
MusicbrainzTokenizer(LuceneVersion.LUCENE_VERSION, no1CharFilter);
TokenStream result = new ICUTransformFilter(tokenStream,
Transliterator.getInstance("[?[:Script=Katakana:]]Katakana-Hiragana"));
result = new ICUTransformFilter(result,
Transliterator.getInstance("Traditional-Simplified"));
result = new MusicbrainzTokenizerFilter(result);
result = new AccentFilter(result);
result = new LowercaseFilter(result);
result = new MusicbrainzWordDelimiterFilter(result,
WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE,
1,
0,
0,
6,
0,
0,
0,
0,
0,
null);
return result;
}
private static final class SavedStreams {
MusicbrainzTokenizer tokenStream;
TokenStream filteredTokenStream;
}
public final TokenStream reusableTokenStream(String fieldName,
Reader reader) throws IOException {
SavedStreams streams = (SavedStreams) getPreviousTokenStream();
if (streams == null) {
streams = new SavedStreams();
setPreviousTokenStream(streams);
streams.tokenStream = new
MusicbrainzTokenizer(LuceneVersion.LUCENE_VERSION, new
PatternReplaceCharFilter(no1Pattern, no1PatternReplacement, new
MappingCharFilter(charConvertMap, reader)));
streams.filteredTokenStream = new
ICUTransformFilter(streams.tokenStream,
Transliterator.getInstance("[?[:Script=Katakana:]]Katakana-Hiragana"));
streams.filteredTokenStream = new
ICUTransformFilter(streams.filteredTokenStream,
Transliterator.getInstance("Traditional-Simplified"));
streams.filteredTokenStream = new
MusicbrainzTokenizerFilter(streams.filteredTokenStream);
streams.filteredTokenStream = new
AccentFilter(streams.filteredTokenStream);
streams.filteredTokenStream = new
LowercaseFilter(streams.filteredTokenStream);
streams.filteredTokenStream = new
MusicbrainzWordDelimiterFilter(streams.filteredTokenStream,
WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE,
1,
0,
0,
6,
0,
0,
0,
0,
0,
null);
} else {
streams.tokenStream.reset(new
PatternReplaceCharFilter(no1Pattern, no1PatternReplacement, new
MappingCharFilter(charConvertMap, reader)));
}
return streams.filteredTokenStream;
}
}
Some of the references back:
+---org.musicbrainz.search.analysis.TitleAnalyzer$SavedStreams |
58,672 | 32 |
| | | | | |
| | +---value of java.util.WeakHashMap$Entry | 144
| 72 |
| | | | | |
| | +---[251] of java.util.WeakHashMap$Entry[256] |
247,888 | 2,072 |
| | | | | |
| | +---table of java.util.WeakHashMap | 248,048
| 80 |
| | | | | |
| | +---hardRefs of
org.apache.lucene.util.CloseableThreadLocal | 248,112
| 40 |
| | | | | |
| | +---tokenStreams of
org.musicbrainz.search.analysis.TitleWithPosGapAnalyzer |
248,160 | 48 |
| | | | | |
| | +---analyzer of
org.musicbrainz.search.index.RecordingIndexField | 64
| 64 |
| | | | | |
| | +---RELEASE of
org.musicbrainz.search.index.RecordingIndexField | 4,720
| 64 |
| | | | | |
| | +---[1756] of java.lang.Object[2560] |
8,089,136 | 20,504 |
| | | | | |
| | +---elementData of java.util.Vector |
8,089,176 | 40 |
| | | | | |
| | +---classes of
org.eclipse.jetty.webapp.WebAppClassLoader | 8,256,880 |
184 |
| | | | | |
| | +---contextClassLoader of
java.util.TimerThread [Thread] "HashSessionScavenger-1" |
568 | 176 |
| | | | |
| +---org.musicbrainz.search.analysis.TitleAnalyzer$SavedStreams
| 79,192 | 32 |
| | | | | |
| | +---value of java.util.WeakHashMap$Entry | 72
| 72 |
| | | | | |
| | +---[243] of java.util.WeakHashMap$Entry[256] |
247,888 | 2,072 |
| | | | | |
| | +---table of java.util.WeakHashMap | 248,048
| 80 |
| | | | | |
| | +---hardRefs of
org.apache.lucene.util.CloseableThreadLocal | 248,112
| 40 |
| | | | | |
| | +---tokenStreams of
org.musicbrainz.search.analysis.TitleWithPosGapAnalyzer |
248,160 | 48 |
| | | | | |
| | +---analyzer of
org.musicbrainz.search.index.RecordingIndexField | 64
| 64 |
| | | | | |
| | +---RELEASE of
org.musicbrainz.search.index.RecordingIndexField | 4,720
| 64 |
| | | | | |
| | +---[1756] of java.lang.Object[2560] |
8,089,136 | 20,504 |
| | | | | |
| | +---elementData of java.util.Vector |
8,089,176 | 40 |
| | | | | |
| | +---classes of
org.eclipse.jetty.webapp.WebAppClassLoader | 8,256,880 |
184 |
| | | | | |
| | +---contextClassLoader of
java.util.TimerThread [Thread] "HashSessionScavenger-1" |
568 | 176 |
| | | | |
| +---org.musicbrainz.search.analysis.TitleAnalyzer$SavedStreams
| 79,176 | 32 |
| | | | | |
| | +---value of java.util.WeakHashMap$Entry | 72
| 72 |
| | | | | |
| | +---[240] of java.util.WeakHashMap$Entry[256] |
247,888 | 2,072 |
| | | | | |
| | +---table of java.util.WeakHashMap | 248,048
| 80 |
| | | | | |
| | +---hardRefs of
org.apache.lucene.util.CloseableThreadLocal | 248,112
| 40 |
| | | | | |
| | +---tokenStreams of
org.musicbrainz.search.analysis.TitleWithPosGapAnalyzer |
248,160 | 48 |
| | | | | |
| | +---analyzer of
org.musicbrainz.search.index.RecordingIndexField | 64
| 64 |
| | | | | |
| | +---RELEASE of
org.musicbrainz.search.index.RecordingIndexField | 4,720
| 64 |
| | | | | |
| | +---[1756] of java.lang.Object[2560] |
8,089,136 | 20,504 |
| | | | | |
| | +---elementData of java.util.Vector |
8,089,176 | 40 |
| | | | | |
| | +---classes of
org.eclipse.jetty.webapp.WebAppClassLoader | 8,256,880 |
184 |
| | | | | |
| | +---contextClassLoader of
java.util.TimerThread [Thread] "HashSessionScavenger-1" |
568 | 176 |
| | | | |
| +---org.musicbrainz.search.analysis.TitleAnalyzer$SavedStreams
| 79,176 | 32 |
| | | | | |
| | +---value of java.util.WeakHashMap$Entry | 72
| 72 |
| | | | | |
| | +---[239] of java.util.WeakHashMap$Entry[256] |
247,888 | 2,072 |
| | | | | |
| | +---table of java.util.WeakHashMap | 248,048
| 80 |
| | | | | |
| | +---hardRefs of
org.apache.lucene.util.CloseableThreadLocal | 248,112
| 40 |
| | | | | |
| | +---tokenStreams of
org.musicbrainz.search.analysis.TitleWithPosGapAnalyzer |
248,160 | 48 |
| | | | | |
| | +---analyzer of
org.musicbrainz.search.index.RecordingIndexField | 64
| 64 |
| | | | | |
| | +---RELEASE of
org.musicbrainz.search.index.RecordingIndexField | 4,720
| 64 |
| | | | | |
| | +---[1756] of java.lang.Object[2560] |
8,089,136 | 20,504 |
| | | | | |
| | +---elementData of java.util.Vector |
8,089,176 | 40 |
| | | | | |
| | +---classes of
org.eclipse.jetty.webapp.WebAppClassLoader | 8,256,880 |
184 |
| | | | | |
| | +---contextClassLoader of
java.util.TimerThread [Thread] "HashSessionScavenger-1" |
568 | 176 |
---------------------------------------------------------------------
To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-user-help@lucene.apache.org
Re: Is there a problem with my Analyzer subclass ?
Posted by Paul Taylor <pa...@fastmail.fm>.
I've found a simpler subclass, that illustrates the same problem
package org.musicbrainz.search.analysis;
import org.apache.lucene.analysis.*;
import java.io.IOException;
import java.io.Reader;
/**
* For analyzing catalogno, so that values containing spaces can be compared
* with values that do not.
* Removes any spaces and lowercases the remaining text.
*/
public class StripSpacesAnalyzer extends Analyzer {

    /** Character mappings applied to the raw input before tokenization. */
    protected NormalizeCharMap charConvertMap;

    /** Builds {@link #charConvertMap} so that every space maps to the empty string. */
    protected void setCharConvertMap() {
        charConvertMap = new NormalizeCharMap();
        charConvertMap.add(" ", "");
    }

    /** Creates the analyzer and builds its character conversion map. */
    public StripSpacesAnalyzer() {
        setCharConvertMap();
    }

    /**
     * Wraps the raw reader in the character mapping filter.
     *
     * @param reader the raw input
     * @return the mapped character stream to hand to the tokenizer
     */
    private CharFilter wrapReader(Reader reader) {
        return new MappingCharFilter(charConvertMap, reader);
    }

    /**
     * Builds a brand new analysis chain for the given reader: mapped input, a
     * keyword tokenizer, then the lowercase filter.
     *
     * @param fieldName the field being analysed (not used by this analyzer)
     * @param reader    the text to analyse
     * @return a freshly constructed token stream
     */
    public final TokenStream tokenStream(String fieldName, final Reader reader) {
        TokenStream result = new KeywordTokenizer(wrapReader(reader));
        result = new LowercaseFilter(result);
        return result;
    }

    /** Holder for the cached tokenizer and the filter wrapped around it. */
    private static final class SavedStreams {
        KeywordTokenizer tokenStream;
        TokenStream filteredTokenStream;
    }

    /**
     * Returns a cached analysis chain, re-pointed at the given reader, creating and
     * caching the chain on first use.
     *
     * NOTE(review): in Lucene 3.x the slot behind setPreviousTokenStream() is
     * documented as per-thread storage, so one SavedStreams per thread that has used
     * this analyzer is expected; confirm against the Lucene version in use.
     *
     * @param fieldName the field being analysed (not used by this analyzer)
     * @param reader    the text to analyse
     * @return the cached token stream, reset to read from {@code reader}
     * @throws IOException if resetting the cached tokenizer fails
     */
    public final TokenStream reusableTokenStream(String fieldName, final Reader reader)
            throws IOException {
        SavedStreams streams = (SavedStreams) getPreviousTokenStream();
        if (streams == null) {
            streams = new SavedStreams();
            streams.tokenStream = new KeywordTokenizer(wrapReader(reader));
            streams.filteredTokenStream = new LowercaseFilter(streams.tokenStream);
            // Register only after the chain is fully built: if construction throws,
            // no half-initialised holder is left behind for the next call to trip on.
            setPreviousTokenStream(streams);
        } else {
            streams.tokenStream.reset(wrapReader(reader));
        }
        return streams.filteredTokenStream;
    }
}
So, to reiterate: looking at a dump, one instance of a StripSpacesAnalyzer
can find itself with multiple instances of the SavedStreams class
connected to it via a WeakHashMap called hardRefs in the tokenStreams
(CloseableThreadLocal) of the superclass. Shouldn't there be just one
per instance of the analyzer?
(Using Lucene 3.6.0)
Paul
Paul
---------------------------------------------------------------------
To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-user-help@lucene.apache.org