You are viewing a plain text version of this content. The canonical link for it is here.
Posted to solr-user@lucene.apache.org by sunnyfr <jo...@gmail.com> on 2008/10/18 17:03:00 UTC
solr 1.3 multi language?
Hi everybody,
I would like some help with managing the multi-language part of my setup;
a concrete example would be excellent.
I have indexed several languages in one core, and I would like to know what
you think of the way I have done it and whether there are other parameters that
I am not aware of. Any help and an example would be greatly appreciated.
Thanks a lot,
I need to handle these languages:
French (FR)
English (EN)
German (DE)
Spanish (ES)
Russian (RU)
Portuguese (Brazilian) (PT)
Polish (PL)
Dutch (NL)
Greek (EL)
Japanese (JA)
Turkish (TR)
My schema looks like this:
<!-- General-purpose text type: whitespace tokenization, stop-word removal, word-delimiter
     splitting, lowercasing and English Porter stemming. Index and query analysis are
     declared separately because they differ in synonym handling and catenation. -->
<fieldType name="text"
           class="solr.TextField"
           positionIncrementGap="100">

  <!-- Index-time chain: no synonyms; word and number parts are also catenated. -->
  <analyzer type="index">
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    <filter class="solr.StopFilterFactory"
            ignoreCase="true"
            words="stopwords.txt"
            enablePositionIncrements="true"/>
    <filter class="solr.WordDelimiterFilterFactory"
            generateWordParts="1"
            generateNumberParts="1"
            catenateWords="1"
            catenateNumbers="1"
            catenateAll="0"
            splitOnCaseChange="1"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/>
    <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
  </analyzer>

  <!-- Query-time chain: synonyms from synonyms.txt are expanded first; catenation is off. -->
  <analyzer type="query">
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    <filter class="solr.SynonymFilterFactory"
            synonyms="synonyms.txt"
            ignoreCase="true"
            expand="true"/>
    <filter class="solr.StopFilterFactory"
            ignoreCase="true"
            words="stopwords.txt"/>
    <filter class="solr.WordDelimiterFilterFactory"
            generateWordParts="1"
            generateNumberParts="1"
            catenateWords="0"
            catenateNumbers="0"
            catenateAll="0"
            splitOnCaseChange="1"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/>
    <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
  </analyzer>
</fieldType>
<!-- languages -->
<!-- French text: standard tokenization, ISO Latin-1 accent folding, lowercasing,
     then Snowball stemming with the French stemmer.
     The single <analyzer> has no type attribute, so the same chain serves both
     indexing and querying. -->
<fieldType name="text_fr" class="solr.TextField">
  <analyzer>
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.StandardFilterFactory"/>
    <filter class="solr.ISOLatin1AccentFilterFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.SnowballPorterFilterFactory" language="French"/>
  </analyzer>
</fieldType>
<!-- English text: standard tokenization, ISO Latin-1 accent folding, lowercasing,
     then Snowball stemming with the English stemmer. One chain for index and query. -->
<fieldType name="text_en" class="solr.TextField">
  <analyzer>
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.StandardFilterFactory"/>
    <filter class="solr.ISOLatin1AccentFilterFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.SnowballPorterFilterFactory" language="English"/>
  </analyzer>
</fieldType>
<!-- German text: standard tokenization, ISO Latin-1 accent folding, lowercasing,
     then Snowball stemming with the German stemmer. One chain for index and query. -->
<fieldType name="text_de" class="solr.TextField">
  <analyzer>
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.StandardFilterFactory"/>
    <filter class="solr.ISOLatin1AccentFilterFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.SnowballPorterFilterFactory" language="German"/>
  </analyzer>
</fieldType>
<!-- Spanish text: standard tokenization, ISO Latin-1 accent folding, lowercasing,
     then Snowball stemming with the Spanish stemmer. One chain for index and query. -->
<fieldType name="text_es" class="solr.TextField">
  <analyzer>
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.StandardFilterFactory"/>
    <filter class="solr.ISOLatin1AccentFilterFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.SnowballPorterFilterFactory" language="Spanish"/>
  </analyzer>
</fieldType>
<!-- Russian text. Lucene's RussianAnalyzer is used as a complete, self-contained analyzer:
     it does its own tokenizing, lowercasing, stop-word removal and stemming.
     The Snowball <filter> that used to sit next to <analyzer> has been dropped. Solr reads
     tokenizers and filters only from inside an <analyzer> element, and an analyzer declared
     through a class attribute does not take child components, so that filter had no effect. -->
<fieldType name="text_ru" class="solr.TextField">
  <analyzer class="org.apache.lucene.analysis.ru.RussianAnalyzer"/>
</fieldType>
<!-- Portuguese text: standard tokenization, ISO Latin-1 accent folding, lowercasing,
     then Snowball stemming with the Portuguese stemmer. One chain for index and query. -->
<fieldType name="text_pt" class="solr.TextField">
  <analyzer>
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.StandardFilterFactory"/>
    <filter class="solr.ISOLatin1AccentFilterFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.SnowballPorterFilterFactory" language="Portuguese"/>
  </analyzer>
</fieldType>
<!-- Italian text: standard tokenization, ISO Latin-1 accent folding, lowercasing,
     then Snowball stemming with the Italian stemmer. One chain for index and query. -->
<fieldType name="text_it" class="solr.TextField">
  <analyzer>
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.StandardFilterFactory"/>
    <filter class="solr.ISOLatin1AccentFilterFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.SnowballPorterFilterFactory" language="Italian"/>
  </analyzer>
</fieldType>
<!-- Dutch text: standard tokenization, ISO Latin-1 accent folding, lowercasing,
     then Snowball stemming with the Dutch stemmer. One chain for index and query. -->
<fieldType name="text_nl" class="solr.TextField">
  <analyzer>
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.StandardFilterFactory"/>
    <filter class="solr.ISOLatin1AccentFilterFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.SnowballPorterFilterFactory" language="Dutch"/>
  </analyzer>
</fieldType>
<!-- Greek text: analysis is delegated entirely to Lucene's GreekAnalyzer class,
     so no tokenizer or filter chain is declared here. -->
<fieldType name="text_el"
           class="solr.TextField">
  <analyzer class="org.apache.lucene.analysis.el.GreekAnalyzer"/>
</fieldType>
<!-- Japanese text. Lucene's CJKAnalyzer is used as a complete analyzer; it already
     tokenizes with CJKTokenizer internally.
     The separate <tokenizer> element that used to precede <analyzer> has been dropped:
     it sat outside any analyzer chain, and it named a raw Lucene tokenizer class rather
     than a Solr tokenizer factory, so it could not contribute to analysis. -->
<fieldType name="text_ja" class="solr.TextField">
  <analyzer class="org.apache.lucene.analysis.cjk.CJKAnalyzer"/>
</fieldType>
...................
<!-- Title: one generic field plus one field per language, each bound to the matching
     language-specific type. All are indexed for search and not stored. -->
<field name="title"    type="text"    indexed="true" stored="false"/>
<field name="title_fr" type="text_fr" indexed="true" stored="false"/>
<field name="title_en" type="text_en" indexed="true" stored="false"/>
<field name="title_de" type="text_de" indexed="true" stored="false"/>
<field name="title_es" type="text_es" indexed="true" stored="false"/>
<field name="title_ru" type="text_ru" indexed="true" stored="false"/>
<field name="title_pt" type="text_pt" indexed="true" stored="false"/>
<field name="title_nl" type="text_nl" indexed="true" stored="false"/>
<field name="title_el" type="text_el" indexed="true" stored="false"/>
<field name="title_ja" type="text_ja" indexed="true" stored="false"/>
<field name="title_it" type="text_it" indexed="true" stored="false"/>
<!-- Description: one generic field plus one field per language, each bound to the matching
     language-specific type. All are indexed for search and not stored. -->
<field name="description"    type="text"    indexed="true" stored="false"/>
<field name="description_fr" type="text_fr" indexed="true" stored="false"/>
<field name="description_en" type="text_en" indexed="true" stored="false"/>
<field name="description_de" type="text_de" indexed="true" stored="false"/>
<field name="description_es" type="text_es" indexed="true" stored="false"/>
<field name="description_ru" type="text_ru" indexed="true" stored="false"/>
<field name="description_pt" type="text_pt" indexed="true" stored="false"/>
<field name="description_nl" type="text_nl" indexed="true" stored="false"/>
<field name="description_el" type="text_el" indexed="true" stored="false"/>
<field name="description_ja" type="text_ja" indexed="true" stored="false"/>
<field name="description_it" type="text_it" indexed="true" stored="false"/>
..............................
<!-- Copy rules: the generic title, description, tag and owner fields are all copied into
     the catch-all "text" field. The "text" and "spell" destination fields, and the tag and
     owner source fields, are declared outside this excerpt.
     NOTE(review): none of the per-language title_xx / description_xx fields are copied
     anywhere here; confirm that is intended. -->
<copyField source="title" dest="text"/>
<copyField source="description" dest="text"/>
<copyField source="tag1" dest="text"/>
<copyField source="tag2" dest="text"/>
<copyField source="tag3" dest="text"/>
<copyField source="tag4" dest="text"/>
<copyField source="owner_login" dest="text"/>
<copyField source="owner_fullname" dest="text"/>
<!-- The title is additionally copied into "spell". -->
<copyField source="title" dest="spell"/>
----------------------------------------------------------------
My data-config:
video.title,
IF(video.language = 'fr', video.title, NULL) as
title_fr,
IF(video.language = 'en', video.title, NULL) as
title_en,
IF(video.language = 'de', video.title, NULL) as
title_de,
IF(video.language = 'es', video.title, NULL) as
title_es,
IF(video.language = 'nl', video.title, NULL) as
title_nl,
IF(video.language = 'el', video.title, NULL) as
title_el,
IF(video.language = 'ja', video.title, NULL) as
title_ja,
IF(video.language = 'it', video.title, NULL) as
title_it,
video.description,
IF(video.language = 'fr', video.description, NULL) as
description_fr,
IF(video.language = 'en', video.description, NULL) as
description_en,
IF(video.language = 'de', video.description, NULL) as
description_de,
IF(video.language = 'es', video.description, NULL) as
description_es,
IF(video.language = 'nl', video.description, NULL) as
description_nl,
IF(video.language = 'el', video.description, NULL) as
description_el,
IF(video.language = 'ja', video.description, NULL) as
description_ja,
IF(video.language = 'it', video.description, NULL) as
description_it,
....
Thanks for your time,
--
View this message in context: http://www.nabble.com/solr-1.3-multi-language--tp20047636p20047636.html
Sent from the Solr - User mailing list archive at Nabble.com.
Re: solr 1.3 multi language?
Posted by sunnyfr <jo...@gmail.com>.
Hi
Can somebody please show me an example for one language?
I would like to know whether, if I define my text_fr as below, I
need to specify in <analyzer> whether its type is index or query, or whether the
definition below is enough.
<!-- French text: standard tokenization, ISO Latin-1 accent folding, lowercasing,
     then Snowball stemming with the French stemmer.
     The single <analyzer> has no type attribute, so the same chain serves both
     indexing and querying; separate type="index" / type="query" analyzers are only
     needed when the two chains should differ. -->
<fieldType name="text_fr" class="solr.TextField">
  <analyzer>
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.StandardFilterFactory"/>
    <filter class="solr.ISOLatin1AccentFilterFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.SnowballPorterFilterFactory" language="French"/>
  </analyzer>
</fieldType>
Then, if I copy all of these fields into text, how should I
define the text type? By default it is defined as below, but I also define my
language-specific types (text_es, text_en, ...). What would you recommend?
<!-- Default "text" type: whitespace tokenization, stop-word removal, word-delimiter
     splitting, lowercasing and English Porter stemming, with separate index-time and
     query-time chains. -->
<fieldType name="text"
           class="solr.TextField"
           positionIncrementGap="100">

  <!-- Index-time chain: no synonyms; word and number parts are also catenated. -->
  <analyzer type="index">
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    <filter class="solr.StopFilterFactory"
            ignoreCase="true"
            words="stopwords.txt"
            enablePositionIncrements="true"/>
    <filter class="solr.WordDelimiterFilterFactory"
            generateWordParts="1"
            generateNumberParts="1"
            catenateWords="1"
            catenateNumbers="1"
            catenateAll="0"
            splitOnCaseChange="1"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/>
    <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
  </analyzer>

  <!-- Query-time chain: synonyms from synonyms.txt are expanded first; catenation is off. -->
  <analyzer type="query">
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    <filter class="solr.SynonymFilterFactory"
            synonyms="synonyms.txt"
            ignoreCase="true"
            expand="true"/>
    <filter class="solr.StopFilterFactory"
            ignoreCase="true"
            words="stopwords.txt"/>
    <filter class="solr.WordDelimiterFilterFactory"
            generateWordParts="1"
            generateNumberParts="1"
            catenateWords="0"
            catenateNumbers="0"
            catenateAll="0"
            splitOnCaseChange="1"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/>
    <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
  </analyzer>
</fieldType>
Thanks a lot,
--
View this message in context: http://www.nabble.com/solr-1.3-multi-language--tp20047636p20064902.html
Sent from the Solr - User mailing list archive at Nabble.com.