You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@geode.apache.org by ankit Soni <an...@gmail.com> on 2020/12/03 03:58:05 UTC
Re: Geode - store and query JSON documents
Thanks a lot Xiaojian Zhou for your clear explanation and detailed reply.
This has helped a lot to proceed with my experiments.
Ankit.
On Fri, Nov 27, 2020, 5:48 AM Xiaojian Zhou <zh...@vmware.com> wrote:
> Ankit:
>
> I wrote some lucene sample code using your data and query.
>
> I also provided gfsh commands to create nested query.
>
> Note: I purposely provided 2 data items to show the difference in query results.
>
> package examples;
>
> import org.apache.geode.cache.Region;
> import org.apache.geode.cache.client.ClientCache;
> import org.apache.geode.cache.client.ClientCacheFactory;
> import org.apache.geode.cache.client.ClientRegionShortcut;
> import org.apache.geode.cache.lucene.LuceneQuery;
> import org.apache.geode.cache.lucene.LuceneQueryException;
> import org.apache.geode.cache.lucene.LuceneServiceProvider;
> import org.apache.geode.cache.lucene.PageableLuceneQueryResults;
> import org.apache.geode.cache.lucene.internal.LuceneIndexImpl;
> import org.apache.geode.cache.lucene.internal.LuceneServiceImpl;
> import org.apache.geode.pdx.JSONFormatter;
> import org.apache.geode.pdx.PdxInstance;
>
> import java.io.IOException;
> import java.util.HashSet;
> import java.util.LinkedList;
> import java.util.List;
> import java.util.concurrent.TimeUnit;
> import java.util.concurrent.atomic.AtomicInteger;
>
> public class JSONTest {
> //NOTE: Below is truncated json, single json document can max contain an
> array of col1...col30 (30 diff attributes)
> // within data.
> public final static String jsonDoc_2 = "{" +
> "\"data\":[{" +
> "\"col1\": {" +
> "\"k11\": \"aaa\"," +
> "\"k12\":true," +
> "\"k13\": 1111," +
> "\"k14\": \"2020-12-31:00:00:00\"" +
> "}," +
> "\"col2\":[{" +
> "\"k21\": \"222222\"," +
> "\"k22\": true" +
> "}]" +
> "}]" +
> "}";
> public final static String jsonDoc_3 = "{" +
> "\"data\":[{" +
> "\"col1\": {" +
> "\"k11\": \"bbb\"," +
> "\"k12\":true," +
> "\"k13\": 1111," +
> "\"k14\": \"2020-12-31:00:00:00\"" +
> "}," +
> "\"col2\":[{" +
> "\"k21\": \"333333\"," +
> "\"k22\": true" +
> "}]" +
> "}]" +
> "}";
>
> //NOTE: Col1....col30 are mix of JSONObject ({}) and JSONArray ([]) as
> shown above in jsonDoc_2;
>
> public final static String REGION_NAME = "REGION_NAME";
>
> public static void main(String[] args) throws InterruptedException,
> LuceneQueryException {
>
> //create client-cache
> ClientCache cache = new
> ClientCacheFactory().addPoolLocator("localhost",
> 10334).setPdxReadSerialized(true).create();
> Region<String, PdxInstance> region = cache.<String,
>
> PdxInstance>createClientRegionFactory(ClientRegionShortcut.CACHING_PROXY)
> .create(REGION_NAME);
>
> //store json document
> region.put("key", JSONFormatter.fromJSON(jsonDoc_2));
> region.put("key3", JSONFormatter.fromJSON(jsonDoc_3));
>
> LuceneServiceImpl service = (LuceneServiceImpl)
> LuceneServiceProvider.get(cache);
> LuceneIndexImpl index = (LuceneIndexImpl)
> service.getIndex("jsonIndex", "REGION_NAME");
> if (index != null) {
> service.waitUntilFlushed("jsonIndex", "REGION_NAME", 60000,
> TimeUnit.MILLISECONDS);
> }
>
> LuceneQuery query =
> service.createLuceneQueryFactory().create("jsonIndex", "REGION_NAME",
> "222222 OR 333333", "data.col2.k21");
> System.out.println("Query 222222 OR 333333");
> HashSet results = getResults(query, "REGION_NAME");
>
> LuceneQuery query2 =
> service.createLuceneQueryFactory().create("jsonIndex", "REGION_NAME",
> "aaa OR xxx OR yyy", "data.col1.k11");
> System.out.println("Query aaa OR xxx OR yyy");
> results = getResults(query2, "REGION_NAME");
>
> // server side:
> // gfsh> start locator
> // gfsh> start server --name=server50505 --server-port=50505
> // gfsh> create lucene index --name=jsonIndex --region=/REGION_NAME
> --field=data.col2.k21,data.col1.k11
> // --serializer=org.apache.geode.cache.lucene.FlatFormatSerializer
> // gfsh> create region --name=REGION_NAME --type=PARTITION
> --redundant-copies=1 --total-num-buckets=61
>
> // How to query json document like,
>
> // 1. select col2.k21, col1, col20 from /REGION_NAME where
> //data.col2.k21 = '222222' OR data.col2.k21 = '333333'
>
> // 2. select col2.k21, col1.k11, col1 from /REGION_NAME where
> // data.col1.k11 in ('aaa', 'xxx', 'yyy')
> }
>
> private static HashSet getResults(LuceneQuery query, String regionName)
> throws LuceneQueryException {
> if (query == null) {
> return null;
> }
>
> PageableLuceneQueryResults<Object, Object> results = query.findPages();
> if (results.size() > 0) {
> System.out.println("Search found " + results.size() + " results in "
> + regionName + ", page size is " + query.getPageSize());
> }
>
> HashSet values = new HashSet<>();
> while (results.hasNext()) {
> results.next().stream()
> .forEach(struct -> {
> Object value = struct.getValue();
> if (value instanceof PdxInstance) {
> PdxInstance pdx = (PdxInstance) value;
> String jsonString = JSONFormatter.toJSON(pdx);
> List<PdxInstance> dataList =
> (LinkedList<PdxInstance>)pdx.getField("data");
> for (PdxInstance data:dataList) {
> Object colObject =
> ((PdxInstance)data).getField("col1");
> System.out.println("col="+colObject);
> }
> System.out.println("Found a json object:" +
> jsonString+":dataList="+dataList);
> values.add(pdx);
> } else {
> System.out.println("key=" + struct.getKey() + ",data=" +
> value);
> values.add(value);
> }
> });
> }
> System.out.println("Search found " + values.size() + " results in " +
> regionName);
> return values;
> }
> }
>
> Regards
> Xiaojian
>
> On 11/23/20, 9:46 AM, "Xiaojian Zhou" <zh...@vmware.com> wrote:
>
> Ankit:
>
> Anil can provide you some sample code of OQL query on JSON.
>
> I will find some lucene sample code on JSON for you.
>
> Regards
> Xiaojian
>
> On 11/23/20, 9:27 AM, "ankit Soni" <an...@gmail.com> wrote:
>
> Hi
> I am looking for any means of querying (OQL/Lucene/API etc..?)
> this stored
> data. Looking to achieve this functionality first and, second,
> in a
> performant way.
>
> I shared the OQL like syntax, to share my use-case easily and
> based on some
> reference found on doc. I am ok if a Lucene query or some other
> way can
> fetch the results.
>
> It will be of great help if you share the sample query/code
> fetching this
> data .
>
> Thanks
> Ankit.
>
>
> On Mon, 23 Nov 2020 at 22:43, Xiaojian Zhou <zh...@vmware.com>
> wrote:
>
> > Anil:
> >
> > The syntax is OQL. But I understand they want to query JSON
> object based on
> > the criteria.
> >
> > On 11/23/20, 9:08 AM, "Anilkumar Gingade" <ag...@vmware.com>
> wrote:
> >
> > Gester, Looking at the sample query, I believe Ankit is
> asking about
> > OQL query not Lucene...
> >
> > -Anil.
> >
> >
> > On 11/23/20, 9:02 AM, "Xiaojian Zhou" <zh...@vmware.com>
> wrote:
> >
> > Ankit:
> >
> > Geode provides Lucene query on JSON fields. Your query
> can be
> > supported.
> >
> >
> https://nam04.safelinks.protection.outlook.com/?url=https%3A%2F%2Fgemfire.docs.pivotal.io%2F910%2Fgeode%2Ftools_modules%2Flucene_integration.html&data=04%7C01%7Czhouxh%40vmware.com%7Cd1fa267b22a14927924108d88fd7b941%7Cb39138ca3cee4b4aa4d6cd83d9dd62f0%7C0%7C0%7C637417503982299596%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&sdata=CWGBZA0ZgNJi7XST88NM8iJhR360sdLiG5LXCeECrO4%3D&reserved=0
> >
> > However, the above document did not provide a query
> example on a
> > JSON object.
> >
> > I can give you some sample code to query on JSON.
> >
> > Regards
> > Xiaojian Zhou
> >
> > On 11/22/20, 11:53 AM, "ankit Soni" <
> ankit.soni.geode@gmail.com>
> > wrote:
> >
> > Hello geode-devs, please provide a guidance on this.
> >
> > Ankit.
> >
> > On Sat, 21 Nov 2020 at 10:23, ankit Soni <
> > ankit.soni.geode@gmail.com> wrote:
> >
> > > Hello team,
> > >
> > > I am *evaluating usage of Geode (1.12) with
> storing JSON
> > documents and
> > > querying the same*. I am able to store the json
> records
> > successfully in
> > > geode but seeking guidance on how to query them.
> > > More details on code and sample json is,
> > >
> > >
> > > *Sample client-code*
> > >
> > > import org.apache.geode.cache.client.ClientCache;
> > > import
> org.apache.geode.cache.client.ClientCacheFactory;
> > > import
> org.apache.geode.cache.client.ClientRegionShortcut;
> > > import org.apache.geode.pdx.JSONFormatter;
> > > import org.apache.geode.pdx.PdxInstance;
> > >
> > > public class MyTest {
> > >
> > > *//NOTE: Below is truncated json, single json
> document
> > can max contain an array of col1...col30 (30 diff attributes)
> within data. *
> > > public final static String jsonDoc_2 = "{" +
> > > "\"data\":[{" +
> > > "\"col1\": {" +
> > > "\"k11\":
> \"aaa\"," +
> > > "\"k12\":true," +
> > > "\"k13\": 1111," +
> > > "\"k14\":
> > \"2020-12-31:00:00:00\"" +
> > > "}," +
> > > "\"col2\":[{" +
> > > "\"k21\":
> \"222222\"," +
> > > "\"k22\": true" +
> > > "}]" +
> > > "}]" +
> > > "}";
> > >
> > > * //NOTE: Col1....col30 are mix of JSONObject
> ({}) and
> > JSONArray ([]) as shown above in jsonDoc_2;*
> > >
> > > public static void main(String[] args){
> > >
> > > //create client-cache
> > > ClientCache cache = new
> > ClientCacheFactory().addPoolLocator(LOCATOR_HOST, PORT).create();
> > > Region<String, PdxInstance> region =
> cache.<String,
> >
> PdxInstance>createClientRegionFactory(ClientRegionShortcut.CACHING_PROXY)
> > > .create(REGION_NAME);
> > >
> > > //store json document
> > > region.put("key",
> JSONFormatter.fromJSON(jsonDoc_2));
> > >
> > > //How to query json document like,
> > >
> > > // 1. select col2.k21, col1, col20 from
> /REGION_NAME
> > where data.col2.k21 = '222222' OR data.col2.k21 = '333333'
> > >
> > > // 2. select col2.k21, col1.k11, col1 from
> > /REGION_NAME where data.col1.k11 in ('aaa', 'xxx', 'yyy')
> > > }
> > > }
> > >
> > > *Server: Region-creation*
> > >
> > > gfsh> create region --name=REGION_NAME
> --type=PARTITION
> > --redundant-copies=1 --total-num-buckets=61
> > >
> > >
> > > *Setup: Distributed cluster of 3 nodes
> > > *
> > >
> > > *My Observations/Problems*
> > > - Put operation takes excessive time:
> region.put("key",
> > > JSONFormatter.fromJSON(jsonDoc_2)); - Fetching a
> single
> > record from a
> > > file and storing it in Geode takes approx. 3 secs
> > > Is there any suggestions/configuration related
> to
> > JSONFormatter API or
> > > other to optimize this...?
> > >
> > > *Looking forward to guidance on querying this JSON
> for above
> > sample
> > > queries.*
> > >
> > > *Thanks*
> > > *Ankit*
> > >
> >
> >
> >
> >
>
>
>