You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@geode.apache.org by ankit Soni <an...@gmail.com> on 2020/12/03 03:58:05 UTC

Re: Geode - store and query JSON documents

Thanks a lot Xiaojian Zhou for your clear explanation and detailed reply.
This has helped a lot to proceed with my experiments.

Ankit.

On Fri, Nov 27, 2020, 5:48 AM Xiaojian Zhou <zh...@vmware.com> wrote:

> Ankit:
>
> I wrote some lucene sample code using your data and query.
>
> I also provided gfsh commands to create nested query.
>
> Note: I purposely provided 2 data records to show the difference in query results.
>
> package examples;
>
> import org.apache.geode.cache.Region;
> import org.apache.geode.cache.client.ClientCache;
> import org.apache.geode.cache.client.ClientCacheFactory;
> import org.apache.geode.cache.client.ClientRegionShortcut;
> import org.apache.geode.cache.lucene.LuceneQuery;
> import org.apache.geode.cache.lucene.LuceneQueryException;
> import org.apache.geode.cache.lucene.LuceneServiceProvider;
> import org.apache.geode.cache.lucene.PageableLuceneQueryResults;
> import org.apache.geode.cache.lucene.internal.LuceneIndexImpl;
> import org.apache.geode.cache.lucene.internal.LuceneServiceImpl;
> import org.apache.geode.pdx.JSONFormatter;
> import org.apache.geode.pdx.PdxInstance;
>
> import java.io.IOException;
> import java.util.HashSet;
> import java.util.LinkedList;
> import java.util.List;
> import java.util.concurrent.TimeUnit;
> import java.util.concurrent.atomic.AtomicInteger;
>
> public class JSONTest {
>   //NOTE: Below is truncated json, single json document can max contain an
> array of col1...col30 (30 diff attributes)
>   // within data.
>   public final static String jsonDoc_2 = "{" +
>           "\"data\":[{" +
>           "\"col1\": {" +
>           "\"k11\": \"aaa\"," +
>           "\"k12\":true," +
>           "\"k13\": 1111," +
>           "\"k14\": \"2020-12-31:00:00:00\"" +
>           "}," +
>           "\"col2\":[{" +
>           "\"k21\": \"222222\"," +
>           "\"k22\": true" +
>           "}]" +
>           "}]" +
>           "}";
>   public final static String jsonDoc_3 = "{" +
>           "\"data\":[{" +
>           "\"col1\": {" +
>           "\"k11\": \"bbb\"," +
>           "\"k12\":true," +
>           "\"k13\": 1111," +
>           "\"k14\": \"2020-12-31:00:00:00\"" +
>           "}," +
>           "\"col2\":[{" +
>           "\"k21\": \"333333\"," +
>           "\"k22\": true" +
>           "}]" +
>           "}]" +
>           "}";
>
>   //NOTE: Col1....col30 are mix of JSONObject ({}) and JSONArray  ([]) as
> shown above in jsonDoc_2;
>
>   public final static String REGION_NAME = "REGION_NAME";
>
>   public static void main(String[] args) throws InterruptedException,
> LuceneQueryException {
>
>     //create client-cache
>     ClientCache cache = new
>             ClientCacheFactory().addPoolLocator("localhost",
> 10334).setPdxReadSerialized(true).create();
>     Region<String, PdxInstance> region = cache.<String,
>
> PdxInstance>createClientRegionFactory(ClientRegionShortcut.CACHING_PROXY)
>             .create(REGION_NAME);
>
>     //store json document
>     region.put("key", JSONFormatter.fromJSON(jsonDoc_2));
>     region.put("key3", JSONFormatter.fromJSON(jsonDoc_3));
>
>     LuceneServiceImpl service = (LuceneServiceImpl)
> LuceneServiceProvider.get(cache);
>     LuceneIndexImpl index = (LuceneIndexImpl)
> service.getIndex("jsonIndex", "REGION_NAME");
>     if (index != null) {
>       service.waitUntilFlushed("jsonIndex", "REGION_NAME", 60000,
> TimeUnit.MILLISECONDS);
>     }
>
>     LuceneQuery query =
> service.createLuceneQueryFactory().create("jsonIndex", "REGION_NAME",
>             "222222 OR 333333", "data.col2.k21");
>     System.out.println("Query 222222 OR 333333");
>     HashSet results = getResults(query, "REGION_NAME");
>
>     LuceneQuery query2 =
> service.createLuceneQueryFactory().create("jsonIndex", "REGION_NAME",
>             "aaa OR xxx OR yyy", "data.col1.k11");
>     System.out.println("Query aaa OR xxx OR yyy");
>     results = getResults(query2, "REGION_NAME");
>
>     // server side:
>     // gfsh> start locator
>     // gfsh> start server --name=server50505 --server-port=50505
>     // gfsh> create lucene index --name=jsonIndex --region=/REGION_NAME
> --field=data.col2.k21,data.col1.k11
>     // --serializer=org.apache.geode.cache.lucene.FlatFormatSerializer
>     // gfsh> create region --name=REGION_NAME --type=PARTITION
> --redundant-copies=1 --total-num-buckets=61
>
>     // How to query json document like,
>
>     // 1. select col2.k21, col1, col20 from /REGION_NAME where
>     //data.col2.k21 = '222222' OR data.col2.k21 = '333333'
>
>     // 2. select col2.k21, col1.k11, col1 from /REGION_NAME where
>     // data.col1.k11 in ('aaa', 'xxx', 'yyy')
>   }
>
>   private static HashSet getResults(LuceneQuery query, String regionName)
> throws LuceneQueryException {
>     if (query == null) {
>       return null;
>     }
>
>     PageableLuceneQueryResults<Object, Object> results = query.findPages();
>     if (results.size() > 0) {
>       System.out.println("Search found " + results.size() + " results in "
> + regionName + ", page size is " + query.getPageSize());
>     }
>
>     HashSet values = new HashSet<>();
>     while (results.hasNext()) {
>       results.next().stream()
>               .forEach(struct -> {
>                 Object value = struct.getValue();
>                 if (value instanceof PdxInstance) {
>                   PdxInstance pdx = (PdxInstance) value;
>                   String jsonString = JSONFormatter.toJSON(pdx);
>                   List<PdxInstance> dataList =
> (LinkedList<PdxInstance>)pdx.getField("data");
>                   for (PdxInstance data:dataList) {
>                     Object colObject =
> ((PdxInstance)data).getField("col1");
>                     System.out.println("col="+colObject);
>                   }
>                   System.out.println("Found a json object:" +
> jsonString+":dataList="+dataList);
>                   values.add(pdx);
>                 } else {
>                   System.out.println("key=" + struct.getKey() + ",data=" +
> value);
>                   values.add(value);
>                 }
>               });
>     }
>     System.out.println("Search found " + values.size() + " results in " +
> regionName);
>     return values;
>   }
> }
>
> Regards
> Xiaojian
>
> On 11/23/20, 9:46 AM, "Xiaojian Zhou" <zh...@vmware.com> wrote:
>
>     Ankit:
>
>     Anil can provide you some sample code of OQL query on JSON.
>
>     I will find some lucene sample code on JSON for you.
>
>     Regards
>     Xiaojian
>
>     On 11/23/20, 9:27 AM, "ankit Soni" <an...@gmail.com> wrote:
>
>         Hi
>         I am looking for any means of querying (OQL/Lucene/API etc..?)
> this stored
>         data. Looking to achieve this functionality first, and second,
> in a
>         performant way.
>
>         I shared the OQL-like syntax to convey my use-case easily, and
> based on some
>         references found in the docs. I am ok if a Lucene query or some other
> way can
>         fetch the results.
>
>         It will be of great help if you share the sample query/code
> fetching this
>         data .
>
>         Thanks
>         Ankit.
>
>
>         On Mon, 23 Nov 2020 at 22:43, Xiaojian Zhou <zh...@vmware.com>
> wrote:
>
>         > Anil:
>         >
>         > The syntax is OQL. But I understand they want to query JSON
> object base on
>         > the criteria.
>         >
>         > On 11/23/20, 9:08 AM, "Anilkumar Gingade" <ag...@vmware.com>
> wrote:
>         >
>         >     Gester, looking at the sample query, I believe Ankit is
> asking about
>         > an OQL query, not Lucene...
>         >
>         >     -Anil.
>         >
>         >
>         >     On 11/23/20, 9:02 AM, "Xiaojian Zhou" <zh...@vmware.com>
> wrote:
>         >
>         >         Ankit:
>         >
>         >         Geode provides Lucene queries on JSON fields. Your query
> can be
>         > supported.
>         >
>         >
> https://nam04.safelinks.protection.outlook.com/?url=https%3A%2F%2Fgemfire.docs.pivotal.io%2F910%2Fgeode%2Ftools_modules%2Flucene_integration.html&amp;data=04%7C01%7Czhouxh%40vmware.com%7Cd1fa267b22a14927924108d88fd7b941%7Cb39138ca3cee4b4aa4d6cd83d9dd62f0%7C0%7C0%7C637417503982299596%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&amp;sdata=CWGBZA0ZgNJi7XST88NM8iJhR360sdLiG5LXCeECrO4%3D&amp;reserved=0
>         >
>         >         However, in the above document, it did not provide a query
> example on
>         > JSON objects.
>         >
>         >         I can give you some sample code to query on JSON.
>         >
>         >         Regards
>         >         Xiaojian Zhou
>         >
>         >         On 11/22/20, 11:53 AM, "ankit Soni" <
> ankit.soni.geode@gmail.com>
>         > wrote:
>         >
>         >             Hello geode-devs, please provide a guidance on this.
>         >
>         >             Ankit.
>         >
>         >             On Sat, 21 Nov 2020 at 10:23, ankit Soni <
>         > ankit.soni.geode@gmail.com> wrote:
>         >
>         >             > Hello team,
>         >             >
>         >             > I am *evaluating usage of Geode (1.12) with
> storing JSON
>         > documents and
>         >             > querying the same*. I am able to store the json
> records
>         > successfully in
>         >             > geode but seeking guidance on how to query them.
>         >             > More details on code and sample json is,
>         >             >
>         >             >
>         >             > *Sample client-code*
>         >             >
>         >             > import org.apache.geode.cache.client.ClientCache;
>         >             > import
> org.apache.geode.cache.client.ClientCacheFactory;
>         >             > import
> org.apache.geode.cache.client.ClientRegionShortcut;
>         >             > import org.apache.geode.pdx.JSONFormatter;
>         >             > import org.apache.geode.pdx.PdxInstance;
>         >             >
>         >             > public class MyTest {
>         >             >
>         >             >     *//NOTE: Below is truncated json, single json
> document
>         > can max contain an array of col1...col30 (30 diff attributes)
> within data. *
>         >             >     public final static  String jsonDoc_2 = "{" +
>         >             >             "\"data\":[{" +
>         >             >                         "\"col1\": {" +
>         >             >                                 "\"k11\":
> \"aaa\"," +
>         >             >                                 "\"k12\":true," +
>         >             >                                 "\"k13\": 1111," +
>         >             >                                 "\"k14\":
>         > \"2020-12-31:00:00:00\"" +
>         >             >                                 "}," +
>         >             >                         "\"col2\":[{" +
>         >             >                                 "\"k21\":
> \"222222\"," +
>         >             >                                 "\"k22\": true" +
>         >             >                                 "}]" +
>         >             >                     "}]" +
>         >             >             "}";
>         >             >
>         >             > *     //NOTE: Col1....col30 are mix of JSONObject
> ({}) and
>         > JSONArray ([]) as shown above in jsonDoc_2;*
>         >             >
>         >             >     public static void main(String[] args){
>         >             >
>         >             >         //create client-cache
>         >             >         ClientCache cache = new
>         > ClientCacheFactory().addPoolLocator(LOCATOR_HOST, PORT).create();
>         >             >         Region<String, PdxInstance> region =
> cache.<String,
>         >
> PdxInstance>createClientRegionFactory(ClientRegionShortcut.CACHING_PROXY)
>         >             >                 .create(REGION_NAME);
>         >             >
>         >             >         //store json document
>         >             >         region.put("key",
> JSONFormatter.fromJSON(jsonDoc_2));
>         >             >
>         >             >         //How to query json document like,
>         >             >
>         >             >         // 1. select col2.k21, col1, col20 from
> /REGION_NAME
>         > where data.col2.k21 = '222222' OR data.col2.k21 = '333333'
>         >             >
>         >             >         // 2. select col2.k21, col1.k11, col1 from
>         > /REGION_NAME where data.col1.k11 in ('aaa', 'xxx', 'yyy')
>         >             >     }
>         >             > }
>         >             >
>         >             > *Server: Region-creation*
>         >             >
>         >             > gfsh> create region --name=REGION_NAME
> --type=PARTITION
>         > --redundant-copies=1 --total-num-buckets=61
>         >             >
>         >             >
>         >             > *Setup: Distributed cluster of 3 nodes
>         >             > *
>         >             >
>         >             > *My Observations/Problems*
>         >             > -  Put operation takes excessive time:
> region.put("key",
>         >             > JSONFormatter.fromJSON(jsonDoc_2));  - Fetching a
> single
>         > record from a
>         >             > file and storing it in Geode takes approx. 3 secs
>         >             >    Are there any suggestions/configurations related
> to
>         > the JSONFormatter API or
>         >             > otherwise to optimize this...?
>         >             >
>         >             > *Looking forward to guidance on querying this JSON
> for above
>         > sample
>         >             > queries.*
>         >             >
>         >             > *Thanks*
>         >             > *Ankit*
>         >             >
>         >
>         >
>         >
>         >
>
>
>