Posted to commits@sdap.apache.org by le...@apache.org on 2017/10/27 22:34:59 UTC
[18/21] incubator-sdap-mudrod git commit: SDAP-1 Import all code
under the SDAP SGA
http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/7b76fa16/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/LocalOntology.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/LocalOntology.java b/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/LocalOntology.java
new file mode 100644
index 0000000..55ca51d
--- /dev/null
+++ b/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/LocalOntology.java
@@ -0,0 +1,392 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License"); you
+ * may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package gov.nasa.jpl.mudrod.ontology.process;
+
+import gov.nasa.jpl.mudrod.ontology.Ontology;
+
+import org.apache.jena.ontology.Individual;
+import org.apache.jena.ontology.OntClass;
+import org.apache.jena.ontology.OntModel;
+import org.apache.jena.ontology.OntModelSpec;
+import org.apache.jena.ontology.OntResource;
+import org.apache.jena.ontology.Restriction;
+import org.apache.jena.rdf.model.AnonId;
+import org.apache.jena.rdf.model.Literal;
+import org.apache.jena.rdf.model.ModelFactory;
+import org.apache.jena.rdf.model.Resource;
+import org.apache.jena.shared.PrefixMapping;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.PrintStream;
+import java.net.MalformedURLException;
+import java.net.URISyntaxException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * The LocalOntology implementation enables us to work with Ontology files
+ * which are cached locally and available on the runtime classpath e.g.
+ * in <code>src/main/resource/ontology/...</code>.
+ * From here we can test and iterate on how the use of an ontology can enhance search.
+ */
+public class LocalOntology implements Ontology {
+
+ public static final Logger LOG = LoggerFactory.getLogger(LocalOntology.class);
+
+ public static final String DELIMITER_SEARCHTERM = " ";
+
+ private Map<Object, Object> searchTerms = new HashMap<>();
+ private static OntologyParser parser;
+ private static OntModel ontologyModel;
+ private Ontology ontology;
+ private static Map<AnonId, String> mAnonIDs = new HashMap<>();
+ private static int mAnonCount = 0;
+ private List<String> ontArrayList;
+
+ public LocalOntology() {
+ //only initialize all the static variables
+ //if first time called to this ontology constructor
+ if (ontology == null) {
+ if (LOG.isInfoEnabled()) {
+ LOG.info("Creating new ontology");
+ }
+ parser = new OwlParser();
+ ontology = this;
+ }
+ if (ontologyModel == null)
+ ontologyModel = ModelFactory.createOntologyModel(OntModelSpec.OWL_MEM, null);
+ load();
+ }
+
+ /**
+ * Static accessor for {@link LocalOntology}
+ * instance implementation defined within <code>config.xml</code>.
+ *
+ * @return a {@link LocalOntology}
+ */
+ public Ontology getInstance() {
+ if (ontology == null) {
+ ontology = new LocalOntology();
+ }
+ return ontology;
+ }
+
+ /**
+ * Load the default <i>sweetAll.owl</i> ontology
+ * from <a href="https://raw.githubusercontent.com/ESIPFed/sweet/master/2.4/sweetAll.owl">
+ * https://raw.githubusercontent.com/ESIPFed/sweet/master/2.4/sweetAll.owl</a>
+ */
+ @Override
+ public void load() {
+ URL ontURL = null;
+ try {
+ ontURL = new URL("https://raw.githubusercontent.com/ESIPFed/sweet/master/2.4/sweetAll.owl");
+ //ontURL = new URL("https://raw.githubusercontent.com/ESIPFed/sweet/master/2.4/reprDataProduct.owl");
+ } catch (MalformedURLException e) {
+ LOG.error("Error when attempting to create URL resource: ", e);
+ }
+ ontArrayList = new ArrayList<>();
+ try {
+ ontArrayList.add(ontURL.toURI().toString());
+ } catch (URISyntaxException e) {
+ LOG.error("Error in URL syntax, please check your Ontology resource: ", e);
+ }
+ if (!ontArrayList.isEmpty()) {
+ load(ontArrayList.stream().toArray(String[]::new));
+ }
+ }
+
+ /**
+ * Load a string array of local URIs which reference .owl files.
+ *
+ * @param urls a string array of URIs referencing the .owl files to load.
+ */
+ @Override
+ public void load(String[] urls) {
+ for (int i = 0; i < urls.length; i++) {
+ String url = urls[i].trim();
+ if (!"".equals(url)) {
+ if (LOG.isInfoEnabled()) {
+ LOG.info("Reading and processing {}", url);
+ }
+ load(ontologyModel, url);
+ }
+ }
+ parser.parse(ontology, ontologyModel);
+ }
+
+ private void load(Object m, String url) {
+ try {
+ ((OntModel) m).read(url, null, null);
+ LOG.info("Successfully processed {}", url);
+ } catch (Exception e) {
+ LOG.error("Failed whilst attempting to read ontology {}: Error: ", url, e);
+ }
+ }
+
+ /**
+ * Get the {@link gov.nasa.jpl.mudrod.ontology.process.OntologyParser}
+ * implementation being used to process the input ontology resources.
+ * @return an {@link gov.nasa.jpl.mudrod.ontology.process.OntologyParser} implementation
+ */
+ public OntologyParser getParser() {
+ if (parser == null) {
+ parser = new OwlParser();
+ }
+ return parser;
+ }
+
+ /**
+ * Return the {@link org.apache.jena.ontology.OntModel} instance
+ * which was created from the input ontology resources.
+ * @return a constructed {@link org.apache.jena.ontology.OntModel}
+ */
+ public static OntModel getModel() {
+ return ontologyModel;
+ }
+
+ /**
+ * Return the loaded Ontology resources.
+ * @return a {@link java.util.List} of resources.
+ */
+ public List<String> getLoadedOntologyResources() {
+ if (ontArrayList != null) {
+ return ontArrayList;
+ } else {
+ return new ArrayList<>();
+ }
+ }
+ /**
+ * Not yet implemented.
+ */
+ @Override
+ public void merge(Ontology o) {
+ // not yet implemented
+ }
+
+ /**
+ * Retrieve all subclasses of the entity(ies) hashed to the search term.
+ * @param entitySearchTerm a query (keywords) for which to obtain
+ * subclasses.
+ * @return an {@link java.util.Iterator} containing the subclasses as Strings.
+ */
+ @Override
+ public Iterator<String> subclasses(String entitySearchTerm) {
+ Map<OntResource, String> classMap = retrieve(entitySearchTerm);
+ Map<String, String> subclasses = new HashMap<>();
+
+ Iterator<OntResource> iter = classMap.keySet().iterator();
+ while (iter.hasNext()) {
+ OntResource resource = iter.next();
+
+ if (resource instanceof OntClass) {
+ //get subclasses N.B. we only get direct sub-classes e.g. direct children;
+ //passing the .listSubClasses(true) boolean parameter makes this restriction
+ //to direct children explicit.
+ for (Iterator<?> i = ((OntClass) resource).listSubClasses(); i.hasNext();) {
+ OntResource subclass = (OntResource) i.next();
+ for (Iterator<?> j = subclass.listLabels(null); j.hasNext();) {
+ Literal l = (Literal) j.next();
+ subclasses.put(l.toString(), "1");
+ }
+ }
+ //get individuals
+ for (Iterator<?> i = ((OntClass) resource).listInstances(); i.hasNext(); ) {
+ OntResource subclass = (OntResource) i.next();
+ for (Iterator<?> j = subclass.listLabels(null); j.hasNext();) {
+ Literal l = (Literal) j.next();
+ subclasses.put(l.toString(), "1");
+ }
+ }
+ } else if (resource instanceof Individual) {
+ for (Iterator<?> i = resource.listSameAs(); i.hasNext();) {
+ OntResource subclass = (OntResource) i.next();
+ for (Iterator<?> j = subclass.listLabels(null); j.hasNext();) {
+ Literal l = (Literal) j.next();
+ subclasses.put(l.toString(), "1");
+ }
+ }
+ }
+ }
+ return subclasses.keySet().iterator();
+ }
+
+ /**
+ * Retrieves synonyms for a given phrase if the phrase
+ * is present in the ontology.
+ * @param queryKeyPhrase an input string representing a phrase
+ * for which we wish to obtain synonyms.
+ * @return an {@link java.util.Iterator} containing synonym string tokens,
+ * or an empty iterator if no synonyms exist for the given queryKeyPhrase.
+ */
+ @Override
+ public Iterator synonyms(String queryKeyPhrase) {
+
+ Map<?, ?> classMap = retrieve(queryKeyPhrase);
+
+ Map<Object, Object> synonyms = new HashMap<>();
+
+ Iterator<?> iter = classMap.keySet().iterator();
+ while (iter.hasNext()) {
+ OntResource resource = (OntResource) iter.next();
+
+ //listLabels
+ for (Iterator<?> i = resource.listLabels(null); i.hasNext();) {
+ Literal l = (Literal) i.next();
+ synonyms.put(l.toString(), "1");
+ }
+
+ if (resource instanceof Individual) {
+ //get all individuals same as this one
+ for (Iterator<?> i = resource.listSameAs(); i.hasNext();) {
+ Individual individual = (Individual) i.next();
+ //add labels
+ for (Iterator<?> j = individual.listLabels(null); j.hasNext();) {
+ Literal l = (Literal) j.next();
+ synonyms.put(l.toString(), "1");
+ }
+ }
+ } else if (resource instanceof OntClass) {
+ //list equivalent classes
+ for (Iterator<?> i = ((OntClass) resource).listEquivalentClasses(); i.hasNext();) {
+ OntClass equivClass = (OntClass) i.next();
+ //add labels
+ for (Iterator<?> j = equivClass.listLabels(null); j.hasNext();) {
+ Literal l = (Literal) j.next();
+ synonyms.put(l.toString(), "1");
+ }
+ }
+ }
+ }
+
+ return synonyms.keySet().iterator();
+ }
+
+ public void addSearchTerm(String label, OntResource resource) {
+ Map<OntResource, String> m = retrieve(label);
+ if (m == null) {
+ m = new HashMap<>();
+ }
+ m.put(resource, "1");
+ searchTerms.put(label.toLowerCase(), m);
+ }
+
+ /**
+ * A basic lookup function for retrieving keys (phrases or tokens)
+ * from the ontology search terms map. Right now only exact lookups
+ * will retrieve a result; this could be improved by using more
+ * advanced parsing logic, such as the Lucene query parser.
+ * @param label the label (phrases or tokens) to retrieve from the
+ * ontology search terms map.
+ * @return an {@link java.util.Map} if there are match(es)
+ * or an empty {@link java.util.HashMap} if there are no
+ * matches.
+ */
+ public Map<OntResource, String> retrieve(String label) {
+ @SuppressWarnings("unchecked")
+ Map<OntResource, String> m = (Map<OntResource, String>) searchTerms.get(label.toLowerCase());
+ if (m == null) {
+ m = new HashMap<>();
+ }
+ return m;
+ }
+
+ protected static void renderHierarchy(PrintStream out, OntClass cls, List<Object> occurs, int depth) {
+ renderClassDescription(out, cls, depth);
+ out.println();
+
+ // recurse to the next level down
+ if (cls.canAs(OntClass.class) && !occurs.contains(cls)) {
+ for (Iterator<?> i = cls.listSubClasses(true); i.hasNext(); ) {
+ OntClass sub = (OntClass) i.next();
+
+ // we push this expression on the occurs list before we recurse
+ occurs.add(cls);
+ renderHierarchy(out, sub, occurs, depth + 1);
+ occurs.remove(cls);
+ }
+ for (Iterator<?> i = cls.listInstances(); i.hasNext(); ) {
+ Individual individual = (Individual) i.next();
+ renderURI(out, individual.getModel(), individual.getURI());
+ out.print(" [");
+ for (Iterator<?> j = individual.listLabels(null); j.hasNext(); ) {
+ out.print(((Literal) j.next()).getString() + ", ");
+ }
+ out.print("] ");
+ out.println();
+ }
+ }
+ }
+
+ public static void renderClassDescription(PrintStream out, OntClass c, int depth) {
+ indent(out, depth);
+
+ if (c.isRestriction()) {
+ renderRestriction(out, (Restriction) c.as(Restriction.class));
+ } else {
+ if (!c.isAnon()) {
+ out.print("Class ");
+ renderURI(out, c.getModel(), c.getURI());
+
+ out.print(c.getLocalName());
+
+ out.print(" [");
+ for (Iterator<?> i = c.listLabels(null); i.hasNext(); ) {
+ out.print(((Literal) i.next()).getString() + ", ");
+ }
+ out.print("] ");
+ } else {
+ renderAnonymous(out, c, "class");
+ }
+ }
+ }
+
+ protected static void renderRestriction(PrintStream out, Restriction r) {
+ if (!r.isAnon()) {
+ out.print("Restriction ");
+ renderURI(out, r.getModel(), r.getURI());
+ } else {
+ renderAnonymous(out, r, "restriction");
+ }
+
+ out.print(" on property ");
+ renderURI(out, r.getModel(), r.getOnProperty().getURI());
+ }
+
+ protected static void renderURI(PrintStream out, PrefixMapping prefixes, String uri) {
+ out.print(prefixes.expandPrefix(uri));
+ }
+
+ protected static void renderAnonymous(PrintStream out, Resource anon, String name) {
+ String anonID = mAnonIDs.get(anon.getId());
+ if (anonID == null) {
+ anonID = "a-" + mAnonCount++;
+ mAnonIDs.put(anon.getId(), anonID);
+ }
+
+ out.print("Anonymous ");
+ out.print(name);
+ out.print(" with ID ");
+ out.print(anonID);
+ }
+
+ protected static void indent(PrintStream out, int depth) {
+ for (int i = 0; i < depth; i++) {
+ out.print(" ");
+ }
+ }
+
+}
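
For reference, a minimal usage sketch of the LocalOntology class above (not part of this commit): the query phrase is illustrative, and constructing LocalOntology triggers load(), which fetches sweetAll.owl over the network before handing it to OwlParser.

import gov.nasa.jpl.mudrod.ontology.process.LocalOntology;
import java.util.Iterator;

public class LocalOntologyDemo {
  public static void main(String[] args) {
    LocalOntology ontology = new LocalOntology(); // downloads and parses sweetAll.owl
    // "ocean temperature" is an illustrative query phrase, not a term guaranteed to be in SWEET
    for (Iterator<?> syns = ontology.synonyms("ocean temperature"); syns.hasNext();) {
      System.out.println("synonym: " + syns.next());
    }
    for (Iterator<String> subs = ontology.subclasses("ocean temperature"); subs.hasNext();) {
      System.out.println("subclass: " + subs.next());
    }
  }
}
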
http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/7b76fa16/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/OntologyLinkCal.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/OntologyLinkCal.java b/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/OntologyLinkCal.java
new file mode 100644
index 0000000..a68a0cb
--- /dev/null
+++ b/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/OntologyLinkCal.java
@@ -0,0 +1,113 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License"); you
+ * may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package gov.nasa.jpl.mudrod.ontology.process;
+
+import gov.nasa.jpl.mudrod.discoveryengine.DiscoveryStepAbstract;
+import gov.nasa.jpl.mudrod.driver.ESDriver;
+import gov.nasa.jpl.mudrod.driver.SparkDriver;
+import org.elasticsearch.action.index.IndexRequest;
+import org.elasticsearch.common.xcontent.XContentBuilder;
+import org.elasticsearch.index.query.QueryBuilders;
+
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.Properties;
+import java.util.concurrent.ExecutionException;
+
+import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
+
+/**
+ * Calculates linkage weights between SWEET ontology concepts and imports the resulting triples into Elasticsearch
+ */
+public class OntologyLinkCal extends DiscoveryStepAbstract {
+
+ public OntologyLinkCal(Properties props, ESDriver es, SparkDriver spark) {
+ super(props, es, spark);
+ es.deleteAllByQuery(props.getProperty("indexName"), props.getProperty("ontologyLinkageType"), QueryBuilders.matchAllQuery());
+ addSWEETMapping();
+ }
+
+ /**
+ * Method of adding mapping for triples extracted from SWEET
+ */
+ public void addSWEETMapping() {
+ XContentBuilder mapping;
+ try {
+ mapping = jsonBuilder().startObject().startObject(props.getProperty("ontologyLinkageType")).startObject("properties").startObject("concept_A").field("type", "string")
+ .field("index", "not_analyzed").endObject().startObject("concept_B").field("type", "string").field("index", "not_analyzed").endObject()
+ .endObject().endObject().endObject();
+
+ es.getClient().admin().indices().preparePutMapping(props.getProperty("indexName")).setType(props.getProperty("ontologyLinkageType")).setSource(mapping).execute().actionGet();
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ }
+
+ /**
+ * Method of calculating and importing SWEET triples into Elasticsearch
+ */
+ @Override
+ public Object execute() {
+ es.deleteType(props.getProperty("indexName"), props.getProperty("ontologyLinkageType"));
+ es.createBulkProcessor();
+
+ BufferedReader br = null;
+ String line = "";
+ double weight = 0;
+
+ try {
+ br = new BufferedReader(new FileReader(props.getProperty("oceanTriples")));
+ while ((line = br.readLine()) != null) {
+ String[] strList = line.toLowerCase().split(",");
+ if (strList[1].equals("subclassof")) {
+ weight = 0.75;
+ } else {
+ weight = 0.9;
+ }
+
+ IndexRequest ir = new IndexRequest(props.getProperty("indexName"), props.getProperty("ontologyLinkageType")).source(
+ jsonBuilder().startObject().field("concept_A", es.customAnalyzing(props.getProperty("indexName"), strList[2]))
+ .field("concept_B", es.customAnalyzing(props.getProperty("indexName"), strList[0])).field("weight", weight).endObject());
+ es.getBulkProcessor().add(ir);
+
+ }
+
+ } catch (IOException e) {
+ e.printStackTrace();
+ } catch (InterruptedException e) {
+ e.printStackTrace();
+ } catch (ExecutionException e) {
+ e.printStackTrace();
+ } finally {
+ if (br != null) {
+ try {
+ br.close();
+ es.destroyBulkProcessor();
+ es.refreshIndex();
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ }
+ }
+ return null;
+ }
+
+ @Override
+ public Object execute(Object o) {
+ return null;
+ }
+
+}
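
For reference, execute() above reads the oceanTriples file line by line and weights each relation; a standalone sketch of that per-line handling follows (the triple is illustrative and assumes the comma-separated layout implied by the indices used above: concept_B,relation,concept_A).

public class TripleWeightDemo {
  public static void main(String[] args) {
    // Hypothetical line standing in for one row of the oceanTriples CSV.
    String line = "sea surface temperature,subClassOf,ocean temperature";
    String[] strList = line.toLowerCase().split(",");
    // subClassOf relations are weighted 0.75, all other relations 0.9, as in execute() above.
    double weight = "subclassof".equals(strList[1]) ? 0.75 : 0.9;
    String conceptA = strList[2]; // "ocean temperature"
    String conceptB = strList[0]; // "sea surface temperature"
    System.out.println(conceptA + " -> " + conceptB + " weight=" + weight);
  }
}
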
http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/7b76fa16/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/OntologyParser.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/OntologyParser.java b/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/OntologyParser.java
new file mode 100644
index 0000000..eca6252
--- /dev/null
+++ b/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/OntologyParser.java
@@ -0,0 +1,47 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License"); you
+ * may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package gov.nasa.jpl.mudrod.ontology.process;
+
+import org.apache.jena.ontology.OntClass;
+import org.apache.jena.ontology.OntModel;
+
+import gov.nasa.jpl.mudrod.ontology.Ontology;
+
+import java.util.Iterator;
+
+/**
+ * Interface for specific ontology parsers e.g. .ttl, RDFXML,
+ * etc.
+ */
+public interface OntologyParser {
+
+ /**
+ * An ontology model (RDF graph) to parse for literals.
+ *
+ * @param ont the associated {@link gov.nasa.jpl.mudrod.ontology.Ontology}
+ * implementation processing the ontology operation(s).
+ * @param ontModel the {@link org.apache.jena.ontology.OntModel}
+ */
+ public void parse(Ontology ont, OntModel ontModel);
+
+ /**
+ * An ontology model (RDF graph) for which to obtain an
+ * {@link java.util.Iterator} instance of all root classes.
+ *
+ * @param ontModel the {@link org.apache.jena.ontology.OntModel}
+ * @return an {@link java.util.Iterator} instance containing all root classes.
+ */
+ public Iterator<OntClass> rootClasses(OntModel ontModel);
+
+}
http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/7b76fa16/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/OwlParser.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/OwlParser.java b/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/OwlParser.java
new file mode 100644
index 0000000..e43f04d
--- /dev/null
+++ b/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/OwlParser.java
@@ -0,0 +1,171 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License"); you
+ * may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package gov.nasa.jpl.mudrod.ontology.process;
+
+import org.apache.jena.ontology.Individual;
+import org.apache.jena.ontology.OntClass;
+import org.apache.jena.ontology.OntModel;
+import org.apache.jena.rdf.model.Literal;
+
+import com.esotericsoftware.minlog.Log;
+
+import gov.nasa.jpl.mudrod.ontology.Ontology;
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * {@link gov.nasa.jpl.mudrod.ontology.process.OntologyParser}
+ * implementation for <a href="http://www.w3.org/TR/owl-features/">W3C OWL</a>
+ * files.
+ */
+public class OwlParser implements OntologyParser {
+
+ private Ontology ont;
+ private List<OntClass> roots = new ArrayList<>();
+
+ public OwlParser() {
+ //default constructor
+ }
+
+ /**
+ * Parse OWL ontology files using Apache Jena
+ */
+ @Override
+ public void parse(Ontology ont, OntModel m) {
+ this.ont = ont;
+ for (Iterator<OntClass> i = rootClasses(m); i.hasNext(); ) {
+ OntClass c = i.next();
+
+ //don't deal with anonymous classes
+ if (c.isAnon()) {
+ continue;
+ }
+
+ parseClass(c, new ArrayList<>(), 0);
+ }
+ }
+
+ protected void parseClass(OntClass cls, List<Object> occurs, int depth) {
+ //don't deal with anonymous classes
+ if (cls.isAnon()) {
+ return;
+ }
+
+ //add cls to Ontology searchterms
+ //list labels
+ Iterator<?> labelIter = cls.listLabels(null);
+ //if has no labels
+ if (!labelIter.hasNext()) {
+ //add rdf:ID as a label
+ cls.addLabel(rdfidToLabel(cls.getLocalName()), null);
+ }
+ //reset the label iterator
+ labelIter = cls.listLabels(null);
+
+ while (labelIter.hasNext()) {
+ Literal l = (Literal) labelIter.next();
+ ((LocalOntology) ont).addSearchTerm(l.toString(), cls);
+ }
+
+ // recurse to the next level down
+ if (cls.canAs(OntClass.class) && !occurs.contains(cls)) {
+ //list subclasses
+ for (Iterator<?> i = cls.listSubClasses(true); i.hasNext(); ) {
+ OntClass sub = (OntClass) i.next();
+
+ // we push this expression on the occurs list before we recurse
+ occurs.add(cls);
+ parseClass(sub, occurs, depth + 1);
+ occurs.remove(cls);
+ }
+
+ //list instances
+ for (Iterator<?> i = cls.listInstances(); i.hasNext(); ) {
+ //add search terms for each instance
+
+ //list labels
+ Individual individual = (Individual) i.next();
+ for (Iterator<?> j = individual.listLabels(null); j.hasNext(); ) {
+ Literal l = (Literal) j.next();
+ ((LocalOntology) ont).addSearchTerm(l.toString(), individual);
+ }
+ }
+ }
+ }
+
+ /**
+ * Parses out all root classes of the given
+ * {@link org.apache.jena.ontology.OntModel}
+ * @param m the {@link org.apache.jena.ontology.OntModel} we wish to obtain
+ * all root classes for.
+ * @return an {@link java.util.Iterator} of {@link org.apache.jena.ontology.OntClass}
+ * elements representing all root classes.
+ */
+ @Override
+ public Iterator<OntClass> rootClasses(OntModel m) {
+ Iterator<?> i = m.listClasses();
+ if (i.hasNext() && i.next() instanceof OntClass) {
+ //assume ontology has root classes
+ processSingle(m);
+ } else {
+ //check for presence of aggregate/collection ontologies such as sweetAll.owl
+ processCollection(m);
+ }
+
+ return roots.iterator();
+ }
+
+ private void processSingle(OntModel m) {
+ for (Iterator<?> i = m.listClasses(); i.hasNext(); ) {
+ OntClass c = (OntClass) i.next();
+ try {
+ // too confusing to list all the restrictions as root classes
+ if (c.isAnon()) {
+ continue;
+ }
+
+ if (c.hasSuperClass(m.getProfile().THING(), true) || c.getCardinality(m.getProfile().SUB_CLASS_OF()) == 0) {
+ // this class is directly descended from Thing
+ roots.add(c);
+ }
+ } catch (Exception e) {
+ Log.error("Error during extraction or root Classes from Ontology Model: ", e);
+ }
+ }
+ }
+
+ private void processCollection(OntModel m) {
+ for (Iterator<?> i = m.listSubModels(true); i.hasNext(); ) {
+ OntModel ontModel = (OntModel) i.next();
+ processSingle(ontModel);
+ }
+ }
+
+ public String rdfidToLabel(String idString) {
+ Pattern p = Pattern.compile("([a-z0-9])([A-Z])");
+ Matcher m = p.matcher(idString);
+
+ String labelString = idString;
+ while (m.find()) {
+ labelString = labelString.replaceAll(m.group(1) + m.group(2), m.group(1) + " " + m.group(2));
+ }
+ return labelString;
+ }
+
+}
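
For reference, a small standalone check of rdfidToLabel() above, which inserts a space wherever a lower-case letter or digit is followed by an upper-case letter (the input rdf:ID is illustrative).

import gov.nasa.jpl.mudrod.ontology.process.OwlParser;

public class RdfIdLabelDemo {
  public static void main(String[] args) {
    OwlParser parser = new OwlParser();
    // Splits a camelCase rdf:ID into a space-separated label.
    System.out.println(parser.rdfidToLabel("SeaSurfaceTemperature")); // prints "Sea Surface Temperature"
  }
}
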
http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/7b76fa16/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/package-info.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/package-info.java b/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/package-info.java
new file mode 100644
index 0000000..3447426
--- /dev/null
+++ b/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/package-info.java
@@ -0,0 +1,17 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License"); you
+ * may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * This package includes ontology processing classes.
+ */
+package gov.nasa.jpl.mudrod.ontology.process;
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/7b76fa16/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/package-info.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/package-info.java b/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/package-info.java
new file mode 100644
index 0000000..1e5d8bf
--- /dev/null
+++ b/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/package-info.java
@@ -0,0 +1,18 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License"); you
+ * may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * This package includes the preprocessing, processing, and data structure used
+ * by recommendation module.
+ */
+package gov.nasa.jpl.mudrod.recommendation;
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/7b76fa16/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/pre/ImportMetadata.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/pre/ImportMetadata.java b/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/pre/ImportMetadata.java
new file mode 100644
index 0000000..c174f31
--- /dev/null
+++ b/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/pre/ImportMetadata.java
@@ -0,0 +1,111 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License"); you
+ * may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package gov.nasa.jpl.mudrod.recommendation.pre;
+
+import com.google.gson.JsonElement;
+import com.google.gson.JsonParser;
+import gov.nasa.jpl.mudrod.discoveryengine.DiscoveryStepAbstract;
+import gov.nasa.jpl.mudrod.driver.ESDriver;
+import gov.nasa.jpl.mudrod.driver.SparkDriver;
+import gov.nasa.jpl.mudrod.main.MudrodConstants;
+import org.apache.commons.io.IOUtils;
+import org.elasticsearch.action.index.IndexRequest;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.*;
+import java.util.Properties;
+
+/**
+ * ClassName: ImportMetadata Function: Import metadata into Elasticsearch
+ */
+
+public class ImportMetadata extends DiscoveryStepAbstract {
+
+ /**
+ *
+ */
+ private static final long serialVersionUID = 1L;
+ private static final Logger LOG = LoggerFactory.getLogger(ImportMetadata.class);
+
+ public ImportMetadata(Properties props, ESDriver es, SparkDriver spark) {
+ super(props, es, spark);
+ }
+
+ @Override
+ public Object execute() {
+ LOG.info("Starting Metadata Harvesting");
+ startTime = System.currentTimeMillis();
+ addMetadataMapping();
+ importToES();
+ endTime = System.currentTimeMillis();
+ es.refreshIndex();
+ LOG.info("Finished Metadata Harvesting time elapsed: {}s", (endTime - startTime) / 1000);
+ return null;
+ }
+
+ /**
+ * addMetadataMapping: Add a mapping to index metadata in Elasticsearch. Please
+ * invoke this method before importing metadata into Elasticsearch.
+ */
+ public void addMetadataMapping() {
+ String mappingJson = "{\r\n \"dynamic_templates\": " + "[\r\n " + "{\r\n \"strings\": " + "{\r\n \"match_mapping_type\": \"string\","
+ + "\r\n \"mapping\": {\r\n \"type\": \"string\"," + "\r\n \"analyzer\": \"csv\"\r\n }" + "\r\n }\r\n }\r\n ]\r\n}";
+
+ es.getClient().admin().indices().preparePutMapping(props.getProperty(MudrodConstants.ES_INDEX_NAME)).setType(props.getProperty("recom_metadataType")).setSource(mappingJson).execute().actionGet();
+
+ }
+
+ /**
+ * importToES: Index metadata into Elasticsearch from a local file directory.
+ * Please make sure metadata has been harvested from the web service before
+ * invoking this method.
+ */
+ private void importToES() {
+ es.deleteType(props.getProperty("indexName"), props.getProperty("recom_metadataType"));
+
+ es.createBulkProcessor();
+ File directory = new File(props.getProperty(MudrodConstants.RAW_METADATA_PATH));
+ File[] fList = directory.listFiles();
+ for (File file : fList) {
+ InputStream is;
+ try {
+ is = new FileInputStream(file);
+ try {
+ String jsonTxt = IOUtils.toString(is);
+ JsonParser parser = new JsonParser();
+ JsonElement item = parser.parse(jsonTxt);
+ IndexRequest ir = new IndexRequest(props.getProperty(MudrodConstants.ES_INDEX_NAME), props.getProperty("recom_metadataType")).source(item.toString());
+
+ // preprocessdata
+
+ es.getBulkProcessor().add(ir);
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ } catch (FileNotFoundException e) {
+ e.printStackTrace();
+ }
+
+ }
+
+ es.destroyBulkProcessor();
+ }
+
+ @Override
+ public Object execute(Object o) {
+ return null;
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/7b76fa16/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/pre/MetadataTFIDFGenerator.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/pre/MetadataTFIDFGenerator.java b/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/pre/MetadataTFIDFGenerator.java
new file mode 100644
index 0000000..02c74f0
--- /dev/null
+++ b/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/pre/MetadataTFIDFGenerator.java
@@ -0,0 +1,100 @@
+/**
+ * Project Name:mudrod-core
+ * File Name:MetadataTFIDFGenerator.java
+ * Package Name:gov.nasa.jpl.mudrod.recommendation.pre
+ * Date:Aug 22, 2016 12:39:52 PM
+ * Copyright (c) 2016, chenzhou1025@126.com All Rights Reserved.
+ */
+
+package gov.nasa.jpl.mudrod.recommendation.pre;
+
+import gov.nasa.jpl.mudrod.discoveryengine.DiscoveryStepAbstract;
+import gov.nasa.jpl.mudrod.driver.ESDriver;
+import gov.nasa.jpl.mudrod.driver.SparkDriver;
+import gov.nasa.jpl.mudrod.recommendation.structure.MetadataOpt;
+import gov.nasa.jpl.mudrod.utils.LabeledRowMatrix;
+import gov.nasa.jpl.mudrod.utils.MatrixUtil;
+import org.apache.spark.api.java.JavaPairRDD;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Properties;
+
+/**
+ * ClassName: MetadataTFIDFGenerator Function: Generate TF-IDF information for all metadata
+ */
+public class MetadataTFIDFGenerator extends DiscoveryStepAbstract {
+
+ private static final long serialVersionUID = 1L;
+ private static final Logger LOG = LoggerFactory.getLogger(MetadataTFIDFGenerator.class);
+
+ /**
+ * Creates a new instance of MetadataTFIDFGenerator.
+ *
+ * @param props the Mudrod configuration
+ * @param es the Elasticsearch driver
+ * @param spark the Spark driver
+ */
+ public MetadataTFIDFGenerator(Properties props, ESDriver es, SparkDriver spark) {
+ super(props, es, spark);
+ }
+
+ @Override
+ public Object execute() {
+
+ LOG.info("Starting Dataset TF_IDF Matrix Generator");
+ startTime = System.currentTimeMillis();
+ try {
+ generateWordBasedTFIDF();
+ } catch (Exception e) {
+ LOG.error("Error during Dataset TF_IDF Matrix Generation: {}", e);
+ }
+ endTime = System.currentTimeMillis();
+
+ LOG.info("Dataset TF_IDF Matrix Generation complete, time elaspsed: {}s", (endTime - startTime) / 1000);
+
+ return null;
+ }
+
+ @Override
+ public Object execute(Object o) {
+ return null;
+ }
+
+ public LabeledRowMatrix generateWordBasedTFIDF() throws Exception {
+
+ MetadataOpt opt = new MetadataOpt(props);
+
+ JavaPairRDD<String, String> metadataContents = opt.loadAll(es, spark);
+
+ JavaPairRDD<String, List<String>> metadataWords = opt.tokenizeData(metadataContents, " ");
+
+ LabeledRowMatrix wordtfidfMatrix = opt.tFIDFTokens(metadataWords, spark);
+
+ MatrixUtil.exportToCSV(wordtfidfMatrix.rowMatrix, wordtfidfMatrix.rowkeys, wordtfidfMatrix.colkeys, props.getProperty("metadata_word_tfidf_matrix"));
+
+ return wordtfidfMatrix;
+ }
+
+ public LabeledRowMatrix generateTermBasedTFIDF() throws Exception {
+
+ MetadataOpt opt = new MetadataOpt(props);
+
+ List<String> variables = new ArrayList<>();
+ variables.add("DatasetParameter-Term");
+ variables.add("DatasetParameter-Variable");
+ variables.add("Dataset-ExtractTerm");
+
+ JavaPairRDD<String, String> metadataContents = opt.loadAll(es, spark, variables);
+
+ JavaPairRDD<String, List<String>> metadataTokens = opt.tokenizeData(metadataContents, ",");
+
+ LabeledRowMatrix tokentfidfMatrix = opt.tFIDFTokens(metadataTokens, spark);
+
+ MatrixUtil.exportToCSV(tokentfidfMatrix.rowMatrix, tokentfidfMatrix.rowkeys, tokentfidfMatrix.colkeys, props.getProperty("metadata_term_tfidf_matrix"));
+
+ return tokentfidfMatrix;
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/7b76fa16/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/pre/NormalizeVariables.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/pre/NormalizeVariables.java b/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/pre/NormalizeVariables.java
new file mode 100644
index 0000000..f5eaa9c
--- /dev/null
+++ b/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/pre/NormalizeVariables.java
@@ -0,0 +1,223 @@
+package gov.nasa.jpl.mudrod.recommendation.pre;
+
+import gov.nasa.jpl.mudrod.discoveryengine.DiscoveryStepAbstract;
+import gov.nasa.jpl.mudrod.driver.ESDriver;
+import gov.nasa.jpl.mudrod.driver.SparkDriver;
+import org.elasticsearch.action.search.SearchResponse;
+import org.elasticsearch.action.update.UpdateRequest;
+import org.elasticsearch.common.unit.TimeValue;
+import org.elasticsearch.index.query.QueryBuilders;
+import org.elasticsearch.search.SearchHit;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Properties;
+import java.util.regex.Pattern;
+
+public class NormalizeVariables extends DiscoveryStepAbstract {
+
+ /**
+ *
+ */
+ private static final long serialVersionUID = 1L;
+ private static final Logger LOG = LoggerFactory.getLogger(NormalizeVariables.class);
+ // index name
+ private String indexName;
+ // type name of metadata in ES
+ private String metadataType;
+
+ /**
+ * Creates a new instance of NormalizeVariables.
+ *
+ * @param props the Mudrod configuration
+ * @param es an instantiated {@link ESDriver}
+ * @param spark an instantiated {@link SparkDriver}
+ */
+ public NormalizeVariables(Properties props, ESDriver es, SparkDriver spark) {
+ super(props, es, spark);
+ indexName = props.getProperty("indexName");
+ metadataType = props.getProperty("recom_metadataType");
+ }
+
+ @Override
+ public Object execute() {
+ LOG.info("*****************processing metadata variables starts******************");
+ startTime = System.currentTimeMillis();
+
+ normalizeMetadataVariables(es);
+
+ endTime = System.currentTimeMillis();
+ LOG.info("*****************processing metadata variables ends******************Took {}s", (endTime - startTime) / 1000);
+
+ return null;
+ }
+
+ @Override
+ public Object execute(Object o) {
+ return null;
+ }
+
+ public void normalizeMetadataVariables(ESDriver es) {
+
+ es.createBulkProcessor();
+
+ SearchResponse scrollResp = es.getClient().prepareSearch(indexName).setTypes(metadataType).setScroll(new TimeValue(60000)).setQuery(QueryBuilders.matchAllQuery()).setSize(100).execute()
+ .actionGet();
+ while (true) {
+ for (SearchHit hit : scrollResp.getHits().getHits()) {
+ Map<String, Object> metadata = hit.getSource();
+ Map<String, Object> updatedValues = new HashMap<>();
+
+ this.normalizeSpatialVariables(metadata, updatedValues);
+ this.normalizeTemporalVariables(metadata, updatedValues);
+ this.normalizeOtherVariables(metadata, updatedValues);
+
+ UpdateRequest ur = es.generateUpdateRequest(indexName, metadataType, hit.getId(), updatedValues);
+ es.getBulkProcessor().add(ur);
+ }
+
+ scrollResp = es.getClient().prepareSearchScroll(scrollResp.getScrollId()).setScroll(new TimeValue(600000)).execute().actionGet();
+ if (scrollResp.getHits().getHits().length == 0) {
+ break;
+ }
+ }
+
+ es.destroyBulkProcessor();
+ }
+
+ private void normalizeOtherVariables(Map<String, Object> metadata, Map<String, Object> updatedValues) {
+ String shortname = (String) metadata.get("Dataset-ShortName");
+ double versionNum = getVersionNum(shortname);
+ updatedValues.put("Dataset-Derivative-VersionNum", versionNum);
+
+ }
+
+ private Double getVersionNum(String version) {
+ if (version == null) {
+ return 0.0;
+ }
+ Double versionNum = 0.0;
+ Pattern p = Pattern.compile(".*[a-zA-Z].*");
+ if ("Operational/Near-Real-Time".equals(version)) {
+ versionNum = 2.0;
+ } else if (version.matches("[0-9]{1}[a-zA-Z]{1}")) {
+ versionNum = Double.parseDouble(version.substring(0, 1));
+ } else if (p.matcher(version).find()) {
+ versionNum = 0.0;
+ } else {
+ versionNum = Double.parseDouble(version);
+ if (versionNum >= 5) {
+ versionNum = 20.0;
+ }
+ }
+ return versionNum;
+ }
+
+ private void normalizeSpatialVariables(Map<String, Object> metadata, Map<String, Object> updatedValues) {
+
+ // get spatial resolution
+ Double spatialR;
+ if (metadata.get("Dataset-SatelliteSpatialResolution") != null) {
+ spatialR = (Double) metadata.get("Dataset-SatelliteSpatialResolution");
+ } else {
+ Double gridR = (Double) metadata.get("Dataset-GridSpatialResolution");
+ if (gridR != null) {
+ spatialR = 111 * gridR;
+ } else {
+ spatialR = 25.0;
+ }
+ }
+ updatedValues.put("Dataset-Derivative-SpatialResolution", spatialR);
+
+ // Transform Longitude and calculate coverage area
+ double top = parseDouble((String) metadata.get("DatasetCoverage-NorthLat"));
+ double bottom = parseDouble((String) metadata.get("DatasetCoverage-SouthLat"));
+ double left = parseDouble((String) metadata.get("DatasetCoverage-WestLon"));
+ double right = parseDouble((String) metadata.get("DatasetCoverage-EastLon"));
+
+ if (left > 180) {
+ left = left - 360;
+ }
+
+ if (right > 180) {
+ right = right - 360;
+ }
+
+ if (left == right) {
+ left = -180;
+ right = 180;
+ }
+
+ double area = (top - bottom) * (right - left);
+
+ updatedValues.put("DatasetCoverage-Derivative-EastLon", right);
+ updatedValues.put("DatasetCoverage-Derivative-WestLon", left);
+ updatedValues.put("DatasetCoverage-Derivative-NorthLat", top);
+ updatedValues.put("DatasetCoverage-Derivative-SouthLat", bottom);
+ updatedValues.put("DatasetCoverage-Derivative-Area", area);
+
+ // get processing level
+ String processingLevel = (String) metadata.get("Dataset-ProcessingLevel");
+ double dProLevel = this.getProLevelNum(processingLevel);
+ updatedValues.put("Dataset-Derivative-ProcessingLevel", dProLevel);
+ }
+
+ private void normalizeTemporalVariables(Map<String, Object> metadata, Map<String, Object> updatedValues) {
+
+ String trStr = (String) metadata.get("Dataset-TemporalResolution");
+ if ("".equals(trStr)) {
+ trStr = (String) metadata.get("Dataset-TemporalRepeat");
+ }
+
+ updatedValues.put("Dataset-Derivative-TemporalResolution", covertTimeUnit(trStr));
+ }
+
+ private Double covertTimeUnit(String str) {
+ Double timeInHour;
+ if (str.contains("Hour")) {
+ timeInHour = Double.parseDouble(str.split(" ")[0]);
+ } else if (str.contains("Day")) {
+ timeInHour = Double.parseDouble(str.split(" ")[0]) * 24;
+ } else if (str.contains("Week")) {
+ timeInHour = Double.parseDouble(str.split(" ")[0]) * 24 * 7;
+ } else if (str.contains("Month")) {
+ timeInHour = Double.parseDouble(str.split(" ")[0]) * 24 * 30;
+ } else if (str.contains("Year")) {
+ timeInHour = Double.parseDouble(str.split(" ")[0]) * 24 * 365;
+ } else {
+ timeInHour = 0.0;
+ }
+
+ return timeInHour;
+ }
+
+ public Double getProLevelNum(String pro) {
+ if (pro == null) {
+ return 1.0;
+ }
+ Double proNum = 0.0;
+ Pattern p = Pattern.compile(".*[a-zA-Z].*");
+ if (pro.matches("[0-9]{1}[a-zA-Z]{1}")) {
+ proNum = Double.parseDouble(pro.substring(0, 1));
+ } else if (p.matcher(pro).find()) {
+ proNum = 1.0;
+ } else {
+ proNum = Double.parseDouble(pro);
+ }
+
+ return proNum;
+ }
+
+ private double parseDouble(String strNumber) {
+ if (strNumber != null && strNumber.length() > 0) {
+ try {
+ return Double.parseDouble(strNumber);
+ } catch (Exception e) {
+ return -1;
+ }
+ } else
+ return 0;
+ }
+}
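
For reference, a standalone restatement of the covertTimeUnit() conversion above, so the derived hour values can be checked in isolation; the resolution strings are illustrative.

public class TemporalResolutionDemo {

  // Mirrors covertTimeUnit() in NormalizeVariables: converts a resolution string
  // such as "1 Day" into hours; unrecognised units fall back to 0.0.
  static double toHours(String str) {
    if (str.contains("Hour")) {
      return Double.parseDouble(str.split(" ")[0]);
    } else if (str.contains("Day")) {
      return Double.parseDouble(str.split(" ")[0]) * 24;
    } else if (str.contains("Week")) {
      return Double.parseDouble(str.split(" ")[0]) * 24 * 7;
    } else if (str.contains("Month")) {
      return Double.parseDouble(str.split(" ")[0]) * 24 * 30;
    } else if (str.contains("Year")) {
      return Double.parseDouble(str.split(" ")[0]) * 24 * 365;
    }
    return 0.0;
  }

  public static void main(String[] args) {
    System.out.println(toHours("12 Hour")); // 12.0
    System.out.println(toHours("1 Week"));  // 168.0
    System.out.println(toHours("1 Month")); // 720.0
  }
}
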
http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/7b76fa16/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/pre/SessionCooccurence.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/pre/SessionCooccurence.java b/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/pre/SessionCooccurence.java
new file mode 100644
index 0000000..2aecce3
--- /dev/null
+++ b/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/pre/SessionCooccurence.java
@@ -0,0 +1,152 @@
+/**
+ * Project Name:mudrod-core
+ * File Name:SessionCooccurence.java
+ * Package Name:gov.nasa.jpl.mudrod.recommendation.pre
+ * Date:Aug 19, 2016 3:06:33 PM
+ * Copyright (c) 2016, chenzhou1025@126.com All Rights Reserved.
+ */
+
+package gov.nasa.jpl.mudrod.recommendation.pre;
+
+import gov.nasa.jpl.mudrod.discoveryengine.DiscoveryStepAbstract;
+import gov.nasa.jpl.mudrod.driver.ESDriver;
+import gov.nasa.jpl.mudrod.driver.SparkDriver;
+import gov.nasa.jpl.mudrod.main.MudrodConstants;
+import gov.nasa.jpl.mudrod.utils.LabeledRowMatrix;
+import gov.nasa.jpl.mudrod.utils.MatrixUtil;
+import gov.nasa.jpl.mudrod.weblog.structure.SessionExtractor;
+import org.apache.spark.api.java.JavaPairRDD;
+import org.apache.spark.api.java.function.PairFunction;
+import org.elasticsearch.action.search.SearchResponse;
+import org.elasticsearch.common.unit.TimeValue;
+import org.elasticsearch.index.query.QueryBuilders;
+import org.elasticsearch.search.SearchHit;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import scala.Tuple2;
+
+import java.util.*;
+
+/**
+ * ClassName: SessionCooccurence Function: Generate the metadata session
+ * co-occurrence matrix from web logs. Each row in the matrix corresponds to
+ * a dataset, and each column to a session.
+ */
+public class SessionCooccurence extends DiscoveryStepAbstract {
+
+ private static final long serialVersionUID = 1L;
+ private static final Logger LOG = LoggerFactory.getLogger(SessionCooccurence.class);
+
+ /**
+ * Creates a new instance of SessionCooccurence.
+ *
+ * @param props
+ * the Mudrod configuration
+ * @param es
+ * the Elasticsearch driver
+ * @param spark
+ * the Spark driver
+ */
+ public SessionCooccurence(Properties props, ESDriver es, SparkDriver spark) {
+ super(props, es, spark);
+ }
+
+ @Override
+ public Object execute() {
+
+ LOG.info("Starting dataset session-based similarity generation...");
+
+ startTime = System.currentTimeMillis();
+
+ // get all metadata session co-occurrence data
+ SessionExtractor extractor = new SessionExtractor();
+ JavaPairRDD<String, List<String>> sessionDatasetRDD = extractor.bulidSessionDatasetRDD(props, es, spark);
+
+ // remove retired datasets
+ JavaPairRDD<String, List<String>> sessionFiltedDatasetsRDD = removeRetiredDataset(es, sessionDatasetRDD);
+ LabeledRowMatrix datasetSessionMatrix = MatrixUtil.createWordDocMatrix(sessionFiltedDatasetsRDD);
+
+ // export
+ MatrixUtil.exportToCSV(datasetSessionMatrix.rowMatrix, datasetSessionMatrix.rowkeys, datasetSessionMatrix.colkeys, props.getProperty("session_metadata_Matrix"));
+
+ endTime = System.currentTimeMillis();
+
+ LOG.info("Completed dataset session-based similarity generation. Time elapsed: {}s", (endTime - startTime) / 1000);
+
+ return null;
+ }
+
+ @Override
+ public Object execute(Object o) {
+ return null;
+ }
+
+ /**
+ * Filter out retired (out-of-date) datasets.
+ *
+ * @param es
+ * the Elasticsearch driver
+ * @param userDatasetsRDD
+ * datasets extracted from sessions
+ * @return filtered session datasets
+ */
+ public JavaPairRDD<String, List<String>> removeRetiredDataset(ESDriver es, JavaPairRDD<String, List<String>> userDatasetsRDD) {
+
+ Map<String, String> nameMap = this.getOnServiceMetadata(es);
+
+ return userDatasetsRDD.mapToPair(new PairFunction<Tuple2<String, List<String>>, String, List<String>>() {
+ /**
+ *
+ */
+ private static final long serialVersionUID = 1L;
+
+ @Override
+ public Tuple2<String, List<String>> call(Tuple2<String, List<String>> arg0) throws Exception {
+ List<String> oriDatasets = arg0._2;
+ List<String> newDatasets = new ArrayList<>();
+ int size = oriDatasets.size();
+ for (int i = 0; i < size; i++) {
+ String name = oriDatasets.get(i);
+ if (nameMap.containsKey(name)) {
+ newDatasets.add(nameMap.get(name));
+ }
+ }
+ return new Tuple2<>(arg0._1, newDatasets);
+ }
+ });
+
+ }
+
+ /**
+ * getOnServiceMetadata: Get on-service metadata names; the key is the lower case
+ * of the short name and the value is the original short name
+ *
+ * @param es
+ * the elasticsearch client
+ * @return a map from lower case metadata name to original metadata name
+ */
+ private Map<String, String> getOnServiceMetadata(ESDriver es) {
+
+ String indexName = props.getProperty(MudrodConstants.ES_INDEX_NAME);
+ String metadataType = props.getProperty("recom_metadataType");
+
+ Map<String, String> shortnameMap = new HashMap<>();
+ SearchResponse scrollResp = es.getClient().prepareSearch(indexName).setTypes(metadataType).setScroll(new TimeValue(60000)).setQuery(QueryBuilders.matchAllQuery()).setSize(100).execute()
+ .actionGet();
+ while (true) {
+ for (SearchHit hit : scrollResp.getHits().getHits()) {
+ Map<String, Object> metadata = hit.getSource();
+ String shortName = (String) metadata.get("Dataset-ShortName");
+ shortnameMap.put(shortName.toLowerCase(), shortName);
+ }
+
+ scrollResp = es.getClient().prepareSearchScroll(scrollResp.getScrollId()).setScroll(new TimeValue(600000)).execute().actionGet();
+ if (scrollResp.getHits().getHits().length == 0) {
+ break;
+ }
+ }
+
+ return shortnameMap;
+ }
+
+}
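
For reference, a self-contained sketch of the filtering performed by removeRetiredDataset() above, run on a local Spark context with a hypothetical one-session RDD and a hypothetical short-name map (standing in for what getOnServiceMetadata() reads from Elasticsearch).

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class RetiredDatasetFilterDemo {
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("RetiredDatasetFilterDemo").setMaster("local[1]");
    JavaSparkContext jsc = new JavaSparkContext(conf);

    // Hypothetical "on service" map: lower-cased short name -> original short name.
    Map<String, String> nameMap = new HashMap<>();
    nameMap.put("dataset_a", "DATASET_A");

    // One hypothetical session that referenced a live dataset and a retired one.
    JavaPairRDD<String, List<String>> sessions = jsc.parallelizePairs(
        Collections.singletonList(new Tuple2<>("session1", Arrays.asList("dataset_a", "dataset_retired"))));

    // Same per-session filtering as removeRetiredDataset(): keep only datasets
    // present in the map, and restore their original short names.
    JavaPairRDD<String, List<String>> filtered = sessions.mapToPair(t -> {
      List<String> kept = new ArrayList<>();
      for (String name : t._2) {
        if (nameMap.containsKey(name)) {
          kept.add(nameMap.get(name));
        }
      }
      return new Tuple2<>(t._1, kept);
    });

    System.out.println(filtered.collect()); // [(session1,[DATASET_A])]
    jsc.stop();
  }
}
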
http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/7b76fa16/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/pre/package-info.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/pre/package-info.java b/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/pre/package-info.java
new file mode 100644
index 0000000..2febf96
--- /dev/null
+++ b/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/pre/package-info.java
@@ -0,0 +1,17 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License"); you
+ * may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * This package includes the preprocessing required by recommendation module.
+ */
+package gov.nasa.jpl.mudrod.recommendation.pre;
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/7b76fa16/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/process/AbstractBasedSimilarity.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/process/AbstractBasedSimilarity.java b/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/process/AbstractBasedSimilarity.java
new file mode 100644
index 0000000..b0e93fc
--- /dev/null
+++ b/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/process/AbstractBasedSimilarity.java
@@ -0,0 +1,74 @@
+/**
+ * Project Name:mudrod-core
+ * File Name:AbstractBasedSimilarity.java
+ * Package Name:gov.nasa.jpl.mudrod.recommendation.process
+ * Date:Aug 22, 2016 10:45:55 AM
+ * Copyright (c) 2016, chenzhou1025@126.com All Rights Reserved.
+ */
+
+package gov.nasa.jpl.mudrod.recommendation.process;
+
+import gov.nasa.jpl.mudrod.discoveryengine.DiscoveryStepAbstract;
+import gov.nasa.jpl.mudrod.driver.ESDriver;
+import gov.nasa.jpl.mudrod.driver.SparkDriver;
+import gov.nasa.jpl.mudrod.semantics.SVDAnalyzer;
+import gov.nasa.jpl.mudrod.utils.LinkageTriple;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.List;
+import java.util.Properties;
+
+/**
+ * ClassName: AbstractBasedSimilarity Function: Recommend metadata based on data content semantic similarity
+ */
+public class AbstractBasedSimilarity extends DiscoveryStepAbstract {
+
+ private static final Logger LOG = LoggerFactory.getLogger(AbstractBasedSimilarity.class);
+
+ /**
+ * Creates a new instance of AbstractBasedSimilarity.
+ *
+ * @param props the Mudrod configuration
+ * @param es the Elasticsearch client
+ * @param spark the Spark driver
+ */
+ public AbstractBasedSimilarity(Properties props, ESDriver es, SparkDriver spark) {
+ super(props, es, spark);
+ }
+
+ @Override
+ public Object execute() {
+
+ LOG.info("*****************abstract similarity calculation starts******************");
+ startTime = System.currentTimeMillis();
+
+ try {
+ /*String topicMatrixFile = props.getProperty("metadata_term_tfidf_matrix");
+ SemanticAnalyzer analyzer = new SemanticAnalyzer(props, es, spark);
+ List<LinkageTriple> triples = analyzer
+ .calTermSimfromMatrix(topicMatrixFile);
+ analyzer.saveToES(triples, props.getProperty("indexName"),
+ props.getProperty("metadataTermTFIDFSimType"), true, true);*/
+
+ // for comparison
+ SVDAnalyzer svd = new SVDAnalyzer(props, es, spark);
+ svd.getSVDMatrix(props.getProperty("metadata_word_tfidf_matrix"), 150, props.getProperty("metadata_word_tfidf_matrix"));
+ List<LinkageTriple> tripleList = svd.calTermSimfromMatrix(props.getProperty("metadata_word_tfidf_matrix"));
+ svd.saveToES(tripleList, props.getProperty("indexName"), props.getProperty("metadataWordTFIDFSimType"), true, true);
+
+ } catch (Exception e) {
+ e.printStackTrace();
+ }
+
+ endTime = System.currentTimeMillis();
+ LOG.info("*****************abstract similarity calculation ends******************Took {}s", (endTime - startTime) / 1000);
+
+ return null;
+ }
+
+ @Override
+ public Object execute(Object o) {
+ return null;
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/7b76fa16/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/process/VariableBasedSimilarity.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/process/VariableBasedSimilarity.java b/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/process/VariableBasedSimilarity.java
new file mode 100644
index 0000000..67aeeb8
--- /dev/null
+++ b/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/process/VariableBasedSimilarity.java
@@ -0,0 +1,380 @@
+package gov.nasa.jpl.mudrod.recommendation.process;
+
+import gov.nasa.jpl.mudrod.discoveryengine.DiscoveryStepAbstract;
+import gov.nasa.jpl.mudrod.driver.ESDriver;
+import gov.nasa.jpl.mudrod.driver.SparkDriver;
+import org.elasticsearch.action.index.IndexRequest;
+import org.elasticsearch.action.search.SearchResponse;
+import org.elasticsearch.action.update.UpdateRequest;
+import org.elasticsearch.common.unit.TimeValue;
+import org.elasticsearch.common.xcontent.XContentBuilder;
+import org.elasticsearch.index.query.QueryBuilders;
+import org.elasticsearch.search.SearchHit;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.io.Serializable;
+import java.text.DecimalFormat;
+import java.util.*;
+
+import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
+
+public class VariableBasedSimilarity extends DiscoveryStepAbstract implements Serializable {
+
+ /**
+ *
+ */
+ private static final long serialVersionUID = 1L;
+
+ private static final Logger LOG = LoggerFactory.getLogger(VariableBasedSimilarity.class);
+
+ private DecimalFormat df = new DecimalFormat("#.000");
+ // a map from variable to its type
+ public Map<String, Integer> variableTypes;
+ public Map<String, Integer> variableWeights;
+
+ private static final Integer VAR_SPATIAL = 1;
+ private static final Integer VAR_TEMPORAL = 2;
+ private static final Integer VAR_CATEGORICAL = 3;
+ private static final Integer VAR_ORDINAL = 4;
+
+ // index name
+ private String indexName;
+ // type name of metadata in ES
+ private String metadataType;
+ private String variableSimType;
+
+ /**
+ * Creates a new instance of VariableBasedSimilarity.
+ *
+ * @param props the Mudrod configuration
+ * @param es an instantiated {@link ESDriver}
+ * @param spark an instantiated {@link SparkDriver}
+ */
+ public VariableBasedSimilarity(Properties props, ESDriver es, SparkDriver spark) {
+ super(props, es, spark);
+
+ indexName = props.getProperty("indexName");
+ metadataType = props.getProperty("recom_metadataType");
+ variableSimType = props.getProperty("metadataCodeSimType");
+ this.inital();
+ }
+
+ @Override
+ public Object execute() {
+ LOG.info("*****************calculating metadata variables based similarity starts******************");
+ startTime = System.currentTimeMillis();
+ es.deleteType(indexName, variableSimType);
+ addMapping(es, indexName, variableSimType);
+
+ VariableBasedSimilarity(es);
+ es.refreshIndex();
+ normalizeVariableWeight(es);
+ es.refreshIndex();
+ endTime = System.currentTimeMillis();
+ LOG.info("*****************calculating metadata variables based similarity ends******************Took {}s", (endTime - startTime) / 1000);
+ return null;
+ }
+
+ @Override
+ public Object execute(Object o) {
+ return null;
+ }
+
+ public void inital() {
+ this.initVariableType();
+ this.initVariableWeight();
+ }
+
+ private void initVariableType() {
+ variableTypes = new HashMap<>();
+
+ variableTypes.put("DatasetParameter-Variable", VAR_CATEGORICAL);
+ variableTypes.put("DatasetRegion-Region", VAR_CATEGORICAL);
+ variableTypes.put("Dataset-ProjectionType", VAR_CATEGORICAL);
+ variableTypes.put("Dataset-ProcessingLevel", VAR_CATEGORICAL);
+ variableTypes.put("DatasetParameter-Topic", VAR_CATEGORICAL);
+ variableTypes.put("DatasetParameter-Term", VAR_CATEGORICAL);
+ variableTypes.put("DatasetParameter-Category", VAR_CATEGORICAL);
+ variableTypes.put("DatasetPolicy-DataFormat", VAR_CATEGORICAL);
+ variableTypes.put("Collection-ShortName", VAR_CATEGORICAL);
+ variableTypes.put("DatasetSource-Source-Type", VAR_CATEGORICAL);
+ variableTypes.put("DatasetSource-Source-ShortName", VAR_CATEGORICAL);
+ variableTypes.put("DatasetSource-Sensor-ShortName", VAR_CATEGORICAL);
+ variableTypes.put("DatasetPolicy-Availability", VAR_CATEGORICAL);
+ variableTypes.put("Dataset-Provider-ShortName", VAR_CATEGORICAL);
+
+ variableTypes.put("Dataset-Derivative-ProcessingLevel", VAR_ORDINAL);
+ variableTypes.put("Dataset-Derivative-TemporalResolution", VAR_ORDINAL);
+ variableTypes.put("Dataset-Derivative-SpatialResolution", VAR_ORDINAL);
+ }
+
+ private void initVariableWeight() {
+ variableWeights = new HashMap<>();
+
+ variableWeights.put("Dataset-Derivative-ProcessingLevel", 5);
+ variableWeights.put("DatasetParameter-Category", 5);
+ variableWeights.put("DatasetParameter-Variable", 5);
+ variableWeights.put("DatasetSource-Sensor-ShortName", 5);
+
+ variableWeights.put("DatasetPolicy-Availability", 4);
+ variableWeights.put("DatasetRegion-Region", 4);
+ variableWeights.put("DatasetSource-Source-Type", 4);
+ variableWeights.put("DatasetSource-Source-ShortName", 4);
+ variableWeights.put("DatasetParameter-Term", 4);
+ variableWeights.put("DatasetPolicy-DataFormat", 4);
+ variableWeights.put("Dataset-Derivative-SpatialResolution", 4);
+ variableWeights.put("Temporal_Covergae", 4);
+
+ variableWeights.put("DatasetParameter-Topic", 3);
+ variableWeights.put("Collection-ShortName", 3);
+ variableWeights.put("Dataset-Derivative-TemporalResolution", 3);
+ variableWeights.put("Spatial_Covergae", 3);
+
+ variableWeights.put("Dataset-ProjectionType", 1);
+ variableWeights.put("Dataset-Provider-ShortName", 1);
+ }
+
+ public void VariableBasedSimilarity(ESDriver es) {
+
+ es.createBulkProcessor();
+
+ List<Map<String, Object>> metadatas = new ArrayList<>();
+ SearchResponse scrollResp = es.getClient().prepareSearch(indexName).setTypes(metadataType).setScroll(new TimeValue(60000)).setQuery(QueryBuilders.matchAllQuery()).setSize(100).execute()
+ .actionGet();
+ while (true) {
+ for (SearchHit hit : scrollResp.getHits().getHits()) {
+ Map<String, Object> metadataA = hit.getSource();
+ metadatas.add(metadataA);
+ }
+
+ scrollResp = es.getClient().prepareSearchScroll(scrollResp.getScrollId()).setScroll(new TimeValue(600000)).execute().actionGet();
+ if (scrollResp.getHits().getHits().length == 0) {
+ break;
+ }
+ }
+
+ int size = metadatas.size();
+ for (int i = 0; i < size; i++) {
+ Map<String, Object> metadataA = metadatas.get(i);
+ String shortNameA = (String) metadataA.get("Dataset-ShortName");
+
+ for (int j = 0; j < size; j++) {
+ Map<String, Object> metadataB = metadatas.get(j);
+ String shortNameB = (String) metadataB.get("Dataset-ShortName");
+
+ try {
+ XContentBuilder contentBuilder = jsonBuilder().startObject();
+ contentBuilder.field("concept_A", shortNameA);
+ contentBuilder.field("concept_B", shortNameB);
+
+ // spatial similarity
+ this.spatialSimilarity(metadataA, metadataB, contentBuilder);
+ // temporal similarity
+ this.temporalSimilarity(metadataA, metadataB, contentBuilder);
+ // categorical variables similarity
+ this.categoricalVariablesSimilarity(metadataA, metadataB, contentBuilder);
+ // ordinal variables similarity
+ this.ordinalVariablesSimilarity(metadataA, metadataB, contentBuilder);
+
+ contentBuilder.endObject();
+
+ IndexRequest ir = new IndexRequest(indexName, variableSimType).source(contentBuilder);
+ es.getBulkProcessor().add(ir);
+
+ } catch (IOException e1) {
+ LOG.error("Error while indexing the variable-based similarity document.", e1);
+ }
+
+ }
+ }
+
+ es.destroyBulkProcessor();
+ }
+
+ /*
+ * refer to P. Frontiera, R. Larson, and J. Radke (2008) A comparison of
+ geometric approaches to assessing spatial similarity for GIR.
+ International Journal of Geographical Information Science,
+ 22(3)
+ */
+ public void spatialSimilarity(Map<String, Object> metadataA, Map<String, Object> metadataB, XContentBuilder contentBuilder) throws IOException {
+
+ double topA = (double) metadataA.get("DatasetCoverage-Derivative-NorthLat");
+ double bottomA = (double) metadataA.get("DatasetCoverage-Derivative-SouthLat");
+ double leftA = (double) metadataA.get("DatasetCoverage-Derivative-WestLon");
+ double rightA = (double) metadataA.get("DatasetCoverage-Derivative-EastLon");
+ double areaA = (double) metadataA.get("DatasetCoverage-Derivative-Area");
+
+ double topB = (double) metadataB.get("DatasetCoverage-Derivative-NorthLat");
+ double bottomB = (double) metadataB.get("DatasetCoverage-Derivative-SouthLat");
+ double leftB = (double) metadataB.get("DatasetCoverage-Derivative-WestLon");
+ double rightB = (double) metadataB.get("DatasetCoverage-Derivative-EastLon");
+ double areaB = (double) metadataB.get("DatasetCoverage-Derivative-Area");
+
+ // Intersect area
+ double xOverlap = Math.max(0, Math.min(rightA, rightB) - Math.max(leftA, leftB));
+ double yOverlap = Math.max(0, Math.min(topA, topB) - Math.max(bottomA, bottomB));
+ double overlapArea = xOverlap * yOverlap;
+
+ // Calculate coverage similarity
+ double similarity = 0.0;
+ if (areaA > 0 && areaB > 0) {
+ similarity = (overlapArea / areaA + overlapArea / areaB) * 0.5;
+ }
+
+ contentBuilder.field("Spatial_Covergae_Sim", similarity);
+ }
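+ // Illustrative check of the measure above (hypothetical boxes, not real metadata):
+ // A spans lon [0, 10], lat [0, 10] (area 100); B spans lon [5, 15], lat [5, 15] (area 100).
+ // xOverlap = min(10, 15) - max(0, 5) = 5, yOverlap = 5, overlapArea = 25,
+ // so similarity = (25/100 + 25/100) * 0.5 = 0.25.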
+
+ public void temporalSimilarity(Map<String, Object> metadataA, Map<String, Object> metadataB, XContentBuilder contentBuilder) throws IOException {
+
+ double similarity = 0.0;
+ double startTimeA = Double.parseDouble((String) metadataA.get("Dataset-DatasetCoverage-StartTimeLong"));
+ String endTimeAStr = (String) metadataA.get("Dataset-DatasetCoverage-StopTimeLong");
+ double endTimeA = 0.0;
+ if ("".equals(endTimeAStr)) {
+ endTimeA = System.currentTimeMillis();
+ } else {
+ endTimeA = Double.parseDouble(endTimeAStr);
+ }
+ double timespanA = endTimeA - startTimeA;
+
+ double startTimeB = Double.parseDouble((String) metadataB.get("Dataset-DatasetCoverage-StartTimeLong"));
+ String endTimeBStr = (String) metadataB.get("Dataset-DatasetCoverage-StopTimeLong");
+ double endTimeB = 0.0;
+ if ("".equals(endTimeBStr)) {
+ endTimeB = System.currentTimeMillis();
+ } else {
+ endTimeB = Double.parseDouble(endTimeBStr);
+ }
+ double timespanB = endTimeB - startTimeB;
+
+ double intersect = 0.0;
+ if (startTimeB >= endTimeA || endTimeB <= startTimeA) {
+ intersect = 0.0;
+ } else if (startTimeB >= startTimeA && endTimeB <= endTimeA) {
+ intersect = timespanB;
+ } else if (startTimeA >= startTimeB && endTimeA <= endTimeB) {
+ intersect = timespanA;
+ } else {
+ intersect = (startTimeA > startTimeB) ? (endTimeB - startTimeA) : (endTimeA - startTimeB);
+ }
+
+ similarity = intersect / (Math.sqrt(timespanA) * Math.sqrt(timespanB));
+ contentBuilder.field("Temporal_Covergae_Sim", similarity);
+ }
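+ // Illustrative check (hypothetical timestamps): A covers [0, 100] and B covers [50, 150];
+ // both spans are 100 and the intersection is 100 - 50 = 50, so
+ // similarity = 50 / (sqrt(100) * sqrt(100)) = 0.5.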
+
+ public void categoricalVariablesSimilarity(Map<String, Object> metadataA, Map<String, Object> metadataB, XContentBuilder contentBuilder) throws IOException {
+
+ for (String variable : variableTypes.keySet()) {
+ Integer type = variableTypes.get(variable);
+ if (!VAR_CATEGORICAL.equals(type)) {
+ continue;
+ }
+
+ double similarity = 0.0;
+ Object valueA = metadataA.get(variable);
+ Object valueB = metadataB.get(variable);
+ if (valueA instanceof ArrayList && valueB instanceof ArrayList) {
+ List<String> aList = (ArrayList<String>) valueA;
+ List<String> bList = (ArrayList<String>) valueB;
+ if (!aList.isEmpty()) {
+ // overlap ratio relative to dataset A: |A intersect B| / |A|
+ List<String> common = new ArrayList<>(aList);
+ common.retainAll(bList);
+ similarity = (double) common.size() / aList.size();
+ }
+
+ } else if (valueA instanceof String) {
+ if (valueA.equals(valueB)) {
+ similarity = 1.0;
+ }
+ }
+
+ contentBuilder.field(variable + "_Sim", similarity);
+ }
+ }
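+ // Illustrative check (hypothetical values): if A lists {sst, wind, salinity, ice} and B lists
+ // {wind, ice} for the same field, the intersection has 2 elements, so similarity = 2 / 4 = 0.5.
+ // Note the ratio is taken relative to A, so this measure is not symmetric.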
+
+ public void ordinalVariablesSimilarity(Map<String, Object> metadataA, Map<String, Object> metadataB, XContentBuilder contentBuilder) throws IOException {
+ for (String variable : variableTypes.keySet()) {
+ Integer type = variableTypes.get(variable);
+ if (!VAR_ORDINAL.equals(type)) {
+ continue;
+ }
+
+ double similarity = 0.0;
+ Object valueA = metadataA.get(variable);
+ Object valueB = metadataB.get(variable);
+ if (valueA != null && valueB != null) {
+
+ double a = (double) valueA;
+ double b = (double) valueB;
+ if (a != 0.0) {
+ similarity = 1 - Math.abs(b - a) / a;
+ if (similarity < 0) {
+ similarity = 0.0;
+ }
+ }
+ }
+
+ contentBuilder.field(variable + "_Sim", similarity);
+ }
+ }
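+ // Illustrative check (hypothetical values): for ordinal values a = 3 and b = 2,
+ // similarity = 1 - |2 - 3| / 3 = 0.667; any b differing from a by more than a maps to 0.
+ // Like the categorical measure, this is relative to dataset A and therefore asymmetric.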
+
+ public static void addMapping(ESDriver es, String index, String type) {
+ XContentBuilder mapping;
+ try {
+ mapping = jsonBuilder().startObject().startObject(type).startObject("properties").startObject("concept_A").field("type", "string").field("index", "not_analyzed").endObject()
+ .startObject("concept_B").field("type", "string").field("index", "not_analyzed").endObject()
+
+ .endObject().endObject().endObject();
+
+ es.getClient().admin().indices().preparePutMapping(index).setType(type).setSource(mapping).execute().actionGet();
+ } catch (IOException e) {
+ LOG.error("Error while creating the mapping for the similarity type.", e);
+ }
+ }
+
+ public void normalizeVariableWeight(ESDriver es) {
+
+ es.createBulkProcessor();
+
+ double totalWeight = 0.0;
+ for (String variable : variableWeights.keySet()) {
+ totalWeight += variableWeights.get(variable);
+ }
+
+ SearchResponse scrollResp = es.getClient().prepareSearch(indexName).setTypes(variableSimType).setScroll(new TimeValue(60000)).setQuery(QueryBuilders.matchAllQuery()).setSize(100).execute()
+ .actionGet();
+ while (true) {
+ for (SearchHit hit : scrollResp.getHits().getHits()) {
+ Map<String, Object> similarities = hit.getSource();
+
+ double totalSim = 0.0;
+ for (String variable : variableWeights.keySet()) {
+ if (similarities.containsKey(variable + "_Sim")) {
+ double value = (double) similarities.get(variable + "_Sim");
+ double weight = variableWeights.get(variable);
+ totalSim += weight * value;
+ }
+ }
+
+ double weight = totalSim / totalWeight;
+ UpdateRequest ur = es.generateUpdateRequest(indexName, variableSimType, hit.getId(), "weight", weight);
+ es.getBulkProcessor().add(ur);
+ }
+
+ scrollResp = es.getClient().prepareSearchScroll(scrollResp.getScrollId()).setScroll(new TimeValue(600000)).execute().actionGet();
+ if (scrollResp.getHits().getHits().length == 0) {
+ break;
+ }
+ }
+
+ es.destroyBulkProcessor();
+ }
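+ // Illustrative note: the stored "weight" field is a weighted average,
+ //   weight = sum_i(w_i * sim_i) / sum(all configured w_i),
+ // where the denominator sums every weight in initVariableWeight() (66 in total),
+ // so any similarity field missing from a document effectively contributes 0.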
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/7b76fa16/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/process/package-info.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/process/package-info.java b/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/process/package-info.java
new file mode 100644
index 0000000..84231f7
--- /dev/null
+++ b/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/process/package-info.java
@@ -0,0 +1,17 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License"); you
+ * may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * This package includes the processing required by recommendation module.
+ */
+package gov.nasa.jpl.mudrod.recommendation.process;
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/7b76fa16/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/process/sessionBasedCF.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/process/sessionBasedCF.java b/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/process/sessionBasedCF.java
new file mode 100644
index 0000000..ae55769
--- /dev/null
+++ b/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/process/sessionBasedCF.java
@@ -0,0 +1,74 @@
+/**
+ * Project Name:mudrod-core
+ * File Name:sessionBasedCF.java
+ * Package Name:gov.nasa.jpl.mudrod.recommendation.process
+ * Date: Aug 19, 2016 3:17:00 PM
+ * Copyright (c) 2016, chenzhou1025@126.com All Rights Reserved.
+ */
+
+package gov.nasa.jpl.mudrod.recommendation.process;
+
+import gov.nasa.jpl.mudrod.discoveryengine.DiscoveryStepAbstract;
+import gov.nasa.jpl.mudrod.driver.ESDriver;
+import gov.nasa.jpl.mudrod.driver.SparkDriver;
+import gov.nasa.jpl.mudrod.semantics.SemanticAnalyzer;
+import gov.nasa.jpl.mudrod.utils.LinkageTriple;
+import gov.nasa.jpl.mudrod.utils.SimilarityUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.File;
+import java.util.List;
+import java.util.Properties;
+
+/**
+ * Recommends metadata based on session-level co-occurrence.
+ */
+public class sessionBasedCF extends DiscoveryStepAbstract {
+
+ private static final Logger LOG = LoggerFactory.getLogger(sessionBasedCF.class);
+
+ /**
+ * Creates a new instance of sessionBasedCF.
+ *
+ * @param props
+ * the Mudrod configuration
+ * @param es
+ * the Elasticsearch driver
+ * @param spark
+ * the Spark driver
+ */
+ public sessionBasedCF(Properties props, ESDriver es, SparkDriver spark) {
+ super(props, es, spark);
+ }
+
+ @Override
+ public Object execute() {
+ LOG.info("*****************Session based metadata similarity starts******************");
+ startTime = System.currentTimeMillis();
+
+ try {
+ String sessionMetadataFile = props.getProperty("session_metadata_Matrix");
+ File f = new File(sessionMetadataFile);
+ if (f.exists()) {
+ SemanticAnalyzer analyzer = new SemanticAnalyzer(props, es, spark);
+ List<LinkageTriple> triples = analyzer.calTermSimfromMatrix(sessionMetadataFile, SimilarityUtil.SIM_PEARSON, 1);
+ analyzer.saveToES(triples, props.getProperty("indexName"), props.getProperty("metadataSessionBasedSimType"), true, false);
+ }
+
+ } catch (Exception e) {
+ LOG.error("Error during session-based metadata similarity calculation.", e);
+ }
+
+ endTime = System.currentTimeMillis();
+ LOG.info("*****************Session based metadata similarity ends******************Took {}s", (endTime - startTime) / 1000);
+
+ return null;
+ }
+
+ @Override
+ public Object execute(Object o) {
+ return null;
+ }
+
+}
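
Note on the SIM_PEARSON call above: calTermSimfromMatrix selects a Pearson-correlation
measure over the session/metadata co-occurrence matrix produced earlier in the pipeline.
The following is a minimal, self-contained sketch of that measure, assuming each dataset
is represented as a numeric vector of session co-occurrence values; it is an illustration
only, not the SimilarityUtil implementation.

    // Sketch only (illustrative, not SimilarityUtil): Pearson correlation
    // between two dataset vectors, e.g. rows of a session x dataset matrix.
    public static double pearson(double[] a, double[] b) {
      int n = a.length;
      double meanA = 0.0;
      double meanB = 0.0;
      for (int i = 0; i < n; i++) {
        meanA += a[i] / n;
        meanB += b[i] / n;
      }
      double cov = 0.0;
      double varA = 0.0;
      double varB = 0.0;
      for (int i = 0; i < n; i++) {
        cov += (a[i] - meanA) * (b[i] - meanB);
        varA += (a[i] - meanA) * (a[i] - meanA);
        varB += (b[i] - meanB) * (b[i] - meanB);
      }
      // Guard against zero variance (constant vectors) to avoid division by zero.
      return (varA == 0.0 || varB == 0.0) ? 0.0 : cov / Math.sqrt(varA * varB);
    }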
http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/7b76fa16/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/structure/HybridRecommendation.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/structure/HybridRecommendation.java b/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/structure/HybridRecommendation.java
new file mode 100644
index 0000000..4163fda
--- /dev/null
+++ b/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/structure/HybridRecommendation.java
@@ -0,0 +1,274 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License"); you
+ * may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package gov.nasa.jpl.mudrod.recommendation.structure;
+
+import com.google.gson.Gson;
+import com.google.gson.JsonElement;
+import com.google.gson.JsonObject;
+import gov.nasa.jpl.mudrod.discoveryengine.DiscoveryStepAbstract;
+import gov.nasa.jpl.mudrod.driver.ESDriver;
+import gov.nasa.jpl.mudrod.driver.SparkDriver;
+import gov.nasa.jpl.mudrod.main.MudrodEngine;
+import org.elasticsearch.action.search.SearchRequestBuilder;
+import org.elasticsearch.action.search.SearchResponse;
+import org.elasticsearch.index.query.QueryBuilders;
+import org.elasticsearch.search.SearchHit;
+import org.elasticsearch.search.sort.SortOrder;
+
+import java.io.IOException;
+import java.text.DecimalFormat;
+import java.util.*;
+
+/**
+ * Recommends metadata by combining several similarity measures, including
+ * variable-based, content (abstract TF-IDF) based, and session-based similarity.
+ */
+public class HybridRecommendation extends DiscoveryStepAbstract {
+ /**
+ *
+ */
+ private static final long serialVersionUID = 1L;
+ // recommended metadata list
+ protected transient List<LinkedTerm> termList = new ArrayList<>();
+ // format decimal
+ DecimalFormat df = new DecimalFormat("#.00");
+ // index name
+ protected static final String INDEX_NAME = "indexName";
+ private static final String WEIGHT = "weight";
+
+ /**
+ * A recommended ("linked") dataset, with its similarity weight and the model that produced it.
+ */
+ class LinkedTerm {
+ public String term = null;
+ public double weight = 0;
+ public String model = null;
+
+ public LinkedTerm(String str, double w, String m) {
+ term = str;
+ weight = w;
+ model = m;
+ }
+ }
+
+ public HybridRecommendation(Properties props, ESDriver es, SparkDriver spark) {
+ super(props, es, spark);
+ }
+
+ @Override
+ public Object execute() {
+ return null;
+ }
+
+ @Override
+ public Object execute(Object o) {
+ return null;
+ }
+
+ /**
+ * Gets recommended data for a given dataset.
+ *
+ * @param input the given dataset
+ * @param num the number of recommended datasets
+ * @return recommended datasets in JSON format
+ */
+ public JsonObject getRecomDataInJson(String input, int num) {
+ JsonObject resultJson = new JsonObject();
+
+ String type = props.getProperty("metadataCodeSimType");
+ Map<String, Double> sortedVariableSimMap = getRelatedData(type, input, num + 10);
+
+ type = props.getProperty("metadataWordTFIDFSimType");
+ Map<String, Double> sortedAbstractSimMap = getRelatedData(type, input, num + 10);
+
+ type = props.getProperty("metadataSessionBasedSimType");
+ Map<String, Double> sortedSessionSimMap = getRelatedData(type, input, num + 10);
+
+ JsonElement variableSimJson = mapToJson(sortedVariableSimMap, num);
+ resultJson.add("variableSim", variableSimJson);
+ JsonElement abstractSimJson = mapToJson(sortedAbstractSimMap, num);
+ resultJson.add("abstractSim", abstractSimJson);
+ JsonElement sessionSimJson = mapToJson(sortedSessionSimMap, num);
+ resultJson.add("sessionSim", sessionSimJson);
+
+ Map<String, Double> hybridSimMap = new HashMap<>();
+
+ for (String name : sortedAbstractSimMap.keySet()) {
+ hybridSimMap.put(name, sortedAbstractSimMap.get(name) /* * 0.4 */);
+ }
+
+ for (String name : sortedVariableSimMap.keySet()) {
+ if (hybridSimMap.get(name) != null) {
+ double sim = hybridSimMap.get(name) + sortedVariableSimMap.get(name) /* * 0.3 */;
+ hybridSimMap.put(name, Double.parseDouble(df.format(sim)));
+ } else {
+ double sim = sortedVariableSimMap.get(name);
+ hybridSimMap.put(name, Double.parseDouble(df.format(sim)));
+ }
+ }
+
+ for (String name : sortedSessionSimMap.keySet()) {
+ if (hybridSimMap.get(name) != null) {
+ double sim = hybridSimMap.get(name) + sortedSessionSimMap.get(name) /* * 0.1 */;
+ hybridSimMap.put(name, Double.parseDouble(df.format(sim)));
+ } else {
+ double sim = sortedSessionSimMap.get(name);
+ hybridSimMap.put(name, Double.parseDouble(df.format(sim)));
+ }
+ }
+
+ Map<String, Double> sortedHybridSimMap = this.sortMapByValue(hybridSimMap);
+
+ JsonElement linkedJson = mapToJson(sortedHybridSimMap, num);
+ resultJson.add("linked", linkedJson);
+
+ return resultJson;
+ }
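+ // Illustrative check (hypothetical scores): if a dataset has abstractSim 0.8, variableSim 0.6
+ // and sessionSim 0.4, the loops above simply sum them to 1.8 (each step formatted with "#.00").
+ // The commented-out factors suggest an intended weighting of 0.4 / 0.3 / 0.1, which would
+ // instead give 0.8 * 0.4 + 0.6 * 0.3 + 0.4 * 0.1 = 0.54.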
+
+ /**
+ * Method of converting hashmap to JSON
+ *
+ * @param wordweights a map from related metadata to weights
+ * @param num the number of converted elements
+ * @return converted JSON object
+ */
+ protected JsonElement mapToJson(Map<String, Double> wordweights, int num) {
+ Gson gson = new Gson();
+
+ List<JsonObject> nodes = new ArrayList<>();
+ Set<String> words = wordweights.keySet();
+ int i = 0;
+ for (String wordB : words) {
+ JsonObject node = new JsonObject();
+ node.addProperty("name", wordB);
+ node.addProperty("weight", wordweights.get(wordB));
+ nodes.add(node);
+
+ i += 1;
+ if (i >= num) {
+ break;
+ }
+ }
+
+ String nodesJson = gson.toJson(nodes);
+ JsonElement nodesElement = gson.fromJson(nodesJson, JsonElement.class);
+
+ return nodesElement;
+ }
+
+ /**
+ * Gets recommended datasets for a given dataset.
+ *
+ * @param type the recommendation method (Elasticsearch type of the similarity)
+ * @param input the given dataset
+ * @param num the number of recommended datasets
+ * @return map from recommended dataset name to similarity value
+ */
+ public Map<String, Double> getRelatedData(String type, String input, int num) {
+ termList = new ArrayList<>();
+ Map<String, Double> termsMap = new HashMap<>();
+ Map<String, Double> sortedMap = new HashMap<>();
+ try {
+ List<LinkedTerm> links = getRelatedDataFromES(type, input, num);
+ int size = links.size();
+ for (int i = 0; i < size; i++) {
+ termsMap.put(links.get(i).term, links.get(i).weight);
+ }
+
+ sortedMap = sortMapByValue(termsMap); // note: sortMapByValue removes entries from termsMap, leaving it empty
+ } catch (Exception e) {
+ e.printStackTrace();
+ }
+
+ return sortedMap;
+ }
+
+ /**
+ * Gets recommended datasets for a given dataset from Elasticsearch.
+ *
+ * @param type the recommendation method (Elasticsearch type of the similarity)
+ * @param input the given dataset
+ * @param num the number of recommended datasets
+ * @return list of recommended datasets
+ */
+ public List<LinkedTerm> getRelatedDataFromES(String type, String input, int num) {
+
+ SearchRequestBuilder builder = es.getClient().prepareSearch(props.getProperty(INDEX_NAME)).setTypes(type).setQuery(QueryBuilders.termQuery("concept_A", input)).addSort(WEIGHT, SortOrder.DESC)
+ .setSize(num);
+
+ SearchResponse usrhis = builder.execute().actionGet();
+
+ for (SearchHit hit : usrhis.getHits().getHits()) {
+ Map<String, Object> result = hit.getSource();
+ String conceptB = (String) result.get("concept_B");
+
+ if (!conceptB.equals(input)) {
+ LinkedTerm lTerm = new LinkedTerm(conceptB, (double) result.get(WEIGHT), type);
+ termList.add(lTerm);
+ }
+ }
+
+ return termList;
+ }
+
+ /**
+ * Method of sorting a map by value
+ *
+ * @param passedMap input map
+ * @return sorted map
+ */
+ public Map<String, Double> sortMapByValue(Map<String, Double> passedMap) {
+ List<String> mapKeys = new ArrayList<>(passedMap.keySet());
+ List<Double> mapValues = new ArrayList<>(passedMap.values());
+ Collections.sort(mapValues, Collections.reverseOrder());
+ Collections.sort(mapKeys, Collections.reverseOrder());
+
+ LinkedHashMap<String, Double> sortedMap = new LinkedHashMap<>();
+
+ Iterator<Double> valueIt = mapValues.iterator();
+ while (valueIt.hasNext()) {
+ Object val = valueIt.next();
+ Iterator<String> keyIt = mapKeys.iterator();
+
+ while (keyIt.hasNext()) {
+ Object key = keyIt.next();
+ String comp1 = passedMap.get(key).toString();
+ String comp2 = val.toString();
+
+ if (comp1.equals(comp2)) {
+ passedMap.remove(key);
+ mapKeys.remove(key);
+ sortedMap.put((String) key, (Double) val);
+ break;
+ }
+ }
+ }
+ return sortedMap;
+ }
+
+ public static void main(String[] args) throws IOException {
+
+ MudrodEngine me = new MudrodEngine();
+ Properties props = me.loadConfig();
+ ESDriver es = new ESDriver(me.getConfig());
+ HybridRecommendation test = new HybridRecommendation(props, es, null);
+
+ // String input = "NSCAT_LEVEL_1.7_V2";
+ String input = "AQUARIUS_L3_SSS_SMIA_MONTHLY-CLIMATOLOGY_V4";
+ JsonObject json = test.getRecomDataInJson(input, 10);
+
+ System.out.println(json.toString());
+ }
+}