You are viewing a plain text version of this content. The canonical link for it is here.
Posted to user@flume.apache.org by Flavio Pompermaier <po...@okkam.it> on 2013/07/24 00:54:26 UTC
Json to solr example
Hi to all,
I finally managed to build a flow from JSON files to Solr (without SolrCell),
and I thought this could help someone else.
Obviously this is my solution. Any comment is appreciated!
Note: my example still has one thing to fix: the url field should be used
as the key for the put, so I need a way to properly modify or replace
the generateSolrSequenceKey command.
This was my modification to the morphline Solr tests (I had to add the
dependency on the JSON morphline module, of course):
/**
 * Builds the morphline from "test-morphlines/solrCellXML2" and hands the
 * sample JSON document, together with expectedRecords, to
 * testDocumentTypesInternal.
 */
@Test
public void testSolrCellXML() throws Exception {
  morphline = createMorphline("test-morphlines/solrCellXML2");

  // Single input document, located under the test-documents resource folder.
  String jsonDocument = RESOURCES_DIR + "/test-documents" + "/somejson.json";

  testDocumentTypesInternal(new String[] { jsonDocument }, expectedRecords);
}
this is my somejson.json:
{
"id":"fa10b55e-feac-4e3d-8275-33117ac6da1a",
"url":"someurl",
"meta":{
"timestamp":1372413068,
"language":"en",
"categories":[
"politics",
"computer",
"economy"
]},
"entity":{
"name":"sometext",
"qualifier":"content"
}
}
and this is solrCellXML2:
# Morphline that reads JSON input, copies selected values into record fields
# and sends the resulting records to Solr.
morphlines : [
  {
    id : morphline1
    importCommands : ["com.cloudera.**"]

    commands : [
      {
        readJson {}
      }

      {
        extractJsonPaths {
          # flatten : true transforms arrays into real arrays
          # (not a String representation).
          # The comment sits on its own lines so that line wrapping cannot
          # leave a stray, uncommented fragment in the config.
          flatten : true

          # record field name : path inside the JSON document
          paths : {
            url : /url
            last_updated : /meta/timestamp
            category : "/meta/categories/[]"
            language : /meta/language
            content : /entity/name/
          }
        }
      }

      {
        generateSolrSequenceKey {
          # NOTE(review): base_id is not among the fields produced by
          # extractJsonPaths above -- confirm where it is expected to come from.
          baseIdField: base_id
          # SOLR_LOCATOR is a substitution that must be defined outside this snippet.
          solrLocator : ${SOLR_LOCATOR}
        }
      }

      {
        sanitizeUnknownSolrFields {
          solrLocator : ${SOLR_LOCATOR}
        }
      }

      # Log each record before it is loaded.
      { logDebug { format : "solrcell output: {}", args : ["@{}"] } }

      {
        loadSolr {
          solrLocator : ${SOLR_LOCATOR}
        }
      }
    ]
  }
]
This is my schema.xml
<?xml version="1.0" encoding="UTF-8" ?>
<!-- Solr schema for documents keyed by their url. -->
<schema name="example-schema" version="1.5">

  <fields>
    <!-- Explicitly declared fields. -->
    <field name="url" type="string" indexed="true" stored="true" required="true" multiValued="false"/>
    <field name="last_updated" type="long" indexed="true" stored="true" multiValued="false"/>
    <field name="category" type="string" indexed="true" stored="true" multiValued="true"
           omitTermFreqAndPositions="false" omitNorms="false"/>
    <field name="tokenized-url" type="text_general" indexed="true" stored="true" multiValued="false"/>
    <field name="language" type="string" indexed="true" stored="true" multiValued="false"/>

    <!-- Catch-all: any field name not declared above is accepted as a
         multi-valued, whitespace-tokenized text field. -->
    <dynamicField name="*" type="text_ws" indexed="true" stored="true" multiValued="true"
                  omitTermFreqAndPositions="false" omitNorms="false"/>

    <field name="_version_" type="long" indexed="true" stored="true"/>

    <!-- Keep a tokenized copy of url alongside the exact string value. -->
    <copyField source="url" dest="tokenized-url"/>
  </fields>

  <uniqueKey>url</uniqueKey>

  <types>
    <fieldType name="string" class="solr.StrField" sortMissingLast="true"/>
    <fieldType name="date" class="solr.TrieDateField" precisionStep="0" positionIncrementGap="0"/>
    <fieldType name="tdate" class="solr.TrieDateField" precisionStep="6" positionIncrementGap="0"/>
    <fieldType name="long" class="solr.TrieLongField" precisionStep="0" positionIncrementGap="0"/>

    <!-- General text: the standard tokenizer is applied at both index and query time. -->
    <fieldType name="text_general" class="solr.TextField" positionIncrementGap="100">
      <analyzer type="index">
        <tokenizer class="solr.StandardTokenizerFactory"/>
      </analyzer>
      <analyzer type="query">
        <tokenizer class="solr.StandardTokenizerFactory"/>
      </analyzer>
    </fieldType>

    <!-- Text split on whitespace only, for exact matching of words. -->
    <fieldType name="text_ws" class="solr.TextField" positionIncrementGap="100">
      <analyzer>
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
      </analyzer>
    </fieldType>
  </types>

</schema>
Best,
Flavio