You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@sdap.apache.org by le...@apache.org on 2017/12/19 14:13:16 UTC
[15/17] incubator-sdap-mudrod git commit: SDAP-7 Change all package
namespaces to org.apache.sdap
http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/metadata/structure/MetadataExtractor.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/metadata/structure/MetadataExtractor.java b/core/src/main/java/gov/nasa/jpl/mudrod/metadata/structure/MetadataExtractor.java
deleted file mode 100644
index a79ca87..0000000
--- a/core/src/main/java/gov/nasa/jpl/mudrod/metadata/structure/MetadataExtractor.java
+++ /dev/null
@@ -1,145 +0,0 @@
-/*
- * Licensed under the Apache License, Version 2.0 (the "License"); you
- * may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package gov.nasa.jpl.mudrod.metadata.structure;
-
-import gov.nasa.jpl.mudrod.driver.ESDriver;
-import org.apache.spark.api.java.JavaPairRDD;
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.api.java.function.Function2;
-import org.apache.spark.api.java.function.PairFunction;
-import org.elasticsearch.action.search.SearchResponse;
-import org.elasticsearch.common.unit.TimeValue;
-import org.elasticsearch.index.query.QueryBuilders;
-import org.elasticsearch.search.SearchHit;
-import scala.Tuple2;
-
-import java.io.Serializable;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Map;
-import java.util.concurrent.ExecutionException;
-
-public class MetadataExtractor implements Serializable {
-
- /**
- *
- */
- private static final long serialVersionUID = 1L;
-
- /**
-  * Creates a metadata extractor. The class declares no instance fields, so
-  * there is nothing to initialise here.
-  */
- public MetadataExtractor() {
- }
-
- /**
-  * Loads all metadata from Elasticsearch and exposes it as a pair RDD. This
-  * is a two-stage convenience wrapper: the documents are first read with
-  * {@link #loadMetadataFromES(ESDriver, String, String)} and the resulting
-  * list is then converted by
-  * {@link #buildMetadataRDD(ESDriver, JavaSparkContext, String, List)}.
-  * The metadata must already have been harvested and stored in Elasticsearch.
-  *
-  * @param es    an Elasticsearch client node instance
-  * @param sc    spark context
-  * @param index index name of log processing application
-  * @param type  metadata type name
-  * @return PairRDD, in each pair key is metadata short name and value is term
-  * list extracted from metadata variables.
-  */
- public JavaPairRDD<String, List<String>> loadMetadata(ESDriver es, JavaSparkContext sc, String index, String type) {
-   // Stage 1: read every metadata document of the requested type.
-   final List<PODAACMetadata> harvested = this.loadMetadataFromES(es, index, type);
-   // Stage 2: turn the harvested records into (short name, terms) pairs.
-   return this.buildMetadataRDD(es, sc, index, harvested);
- }
-
- /**
-  * Loads every metadata document of the given type from Elasticsearch.
-  * <p>
-  * Documents are read through a scroll query in pages of 100 hits. For each
-  * hit the topic, term, variable, keyword and region values are passed
-  * through {@code es.customAnalyzing} before being stored in a
-  * {@link PODAACMetadata}. A document whose analysis fails with an
-  * {@link ExecutionException} is reported and skipped, so the returned list
-  * never contains {@code null} entries. If the calling thread is interrupted,
-  * the interrupt flag is restored and the records collected so far are
-  * returned.
-  *
-  * @param es    an Elasticsearch client node instance
-  * @param index index name of log processing application
-  * @param type  metadata type name
-  * @return metadata list
-  */
- protected List<PODAACMetadata> loadMetadataFromES(ESDriver es, String index, String type) {
-
-   List<PODAACMetadata> metadatas = new ArrayList<PODAACMetadata>();
-   SearchResponse scrollResp = es.getClient().prepareSearch(index).setTypes(type).setQuery(QueryBuilders.matchAllQuery()).setScroll(new TimeValue(60000)).setSize(100).execute().actionGet();
-
-   while (true) {
-     for (SearchHit hit : scrollResp.getHits().getHits()) {
-       try {
-         metadatas.add(this.toMetadata(es, index, hit.getSource()));
-       } catch (InterruptedException e) {
-         // Preserve the interruption for the caller and stop harvesting.
-         Thread.currentThread().interrupt();
-         e.printStackTrace();
-         return metadatas;
-       } catch (ExecutionException e) {
-         // Best effort: report the failure and skip this document instead of
-         // storing a null that would fail later when the list is mapped.
-         e.printStackTrace();
-       }
-     }
-     // NOTE(review): follow-up pages request a 600000 ms keep-alive while the
-     // initial search uses 60000 ms - confirm which value is intended.
-     scrollResp = es.getClient().prepareSearchScroll(scrollResp.getScrollId()).setScroll(new TimeValue(600000)).execute().actionGet();
-     if (scrollResp.getHits().getHits().length == 0) {
-       break;
-     }
-   }
-
-   return metadatas;
- }
-
- /**
-  * Converts the source map of one search hit into a {@link PODAACMetadata}.
-  *
-  * @param es     driver used to analyse the list-valued fields
-  * @param index  index name handed to {@code es.customAnalyzing}
-  * @param result source map of a single hit
-  * @return the populated metadata record
-  * @throws InterruptedException if the analysis is interrupted
-  * @throws ExecutionException   if the analysis fails
-  */
- @SuppressWarnings("unchecked") // the list-valued fields are read as List<String>, as before
- private PODAACMetadata toMetadata(ESDriver es, String index, Map<String, Object> result) throws InterruptedException, ExecutionException {
-   String shortname = (String) result.get("Dataset-ShortName");
-   List<String> topic = (List<String>) result.get("DatasetParameter-Topic");
-   List<String> term = (List<String>) result.get("DatasetParameter-Term");
-   List<String> keyword = (List<String>) result.get("Dataset-Metadata");
-   List<String> variable = (List<String>) result.get("DatasetParameter-Variable");
-   List<String> longname = (List<String>) result.get("DatasetProject-Project-LongName");
-   List<String> region = (List<String>) result.get("DatasetRegion-Region");
-
-   return new PODAACMetadata(shortname, longname, es.customAnalyzing(index, topic), es.customAnalyzing(index, term), es.customAnalyzing(index, variable), es.customAnalyzing(index, keyword),
-       es.customAnalyzing(index, region));
- }
-
- /**
- * buildMetadataRDD: Convert metadata list to JavaPairRDD
- *
- * @param es an Elasticsearch client node instance
- * @param sc spark context
- * @param index index name of log processing application
- * @param metadatas metadata list
- * @return PairRDD, in each pair key is metadata short name and value is term
- * list extracted from metadata variables.
- */
- protected JavaPairRDD<String, List<String>> buildMetadataRDD(ESDriver es, JavaSparkContext sc, String index, List<PODAACMetadata> metadatas) {
- JavaRDD<PODAACMetadata> metadataRDD = sc.parallelize(metadatas);
- JavaPairRDD<String, List<String>> metadataTermsRDD = metadataRDD.mapToPair(new PairFunction<PODAACMetadata, String, List<String>>() {
- /**
- *
- */
- private static final long serialVersionUID = 1L;
-
- @Override
- public Tuple2<String, List<String>> call(PODAACMetadata metadata) throws Exception {
- return new Tuple2<String, List<String>>(metadata.getShortName(), metadata.getAllTermList());
- }
- }).reduceByKey(new Function2<List<String>, List<String>, List<String>>() {
- /**
- *
- */
- private static final long serialVersionUID = 1L;
-
- @Override
- public List<String> call(List<String> v1, List<String> v2) throws Exception {
- List<String> list = new ArrayList<String>();
- list.addAll(v1);
- list.addAll(v2);
- return list;
- }
- });
-
- return metadataTermsRDD;
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/metadata/structure/PODAACMetadata.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/metadata/structure/PODAACMetadata.java b/core/src/main/java/gov/nasa/jpl/mudrod/metadata/structure/PODAACMetadata.java
deleted file mode 100644
index 50b17c0..0000000
--- a/core/src/main/java/gov/nasa/jpl/mudrod/metadata/structure/PODAACMetadata.java
+++ /dev/null
@@ -1,337 +0,0 @@
-/*
- * Licensed under the Apache License, Version 2.0 (the "License"); you
- * may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package gov.nasa.jpl.mudrod.metadata.structure;
-
-import java.io.Serializable;
-import java.util.ArrayList;
-import java.util.List;
-
-/**
- * Serializable holder for the descriptive fields of one data set: its short
- * name plus the long-name, keyword, term, topic, variable and region lists.
- * <p>
- * The list fields stay {@code null} until they are supplied through the
- * seven-argument constructor or populated by one of the {@code set*} methods;
- * the setters create the backing list on first use. The comma-joined getters
- * return an empty string for a list that has not been populated.
- */
-public class PODAACMetadata implements Serializable {
-
-  private static final long serialVersionUID = 1L;
-
-  // shortname: data set short name
-  private String shortname;
-  // abstractStr: data set abstract
-  private String abstractStr;
-  // isoTopic: data set topic
-  private String isoTopic;
-  // sensor: sensor
-  private String sensor;
-  // source: data source
-  private String source;
-  // project: data project
-  private String project;
-  // hasAbstarct: whether data set has abstract (misspelt name kept: the field is package-visible)
-  boolean hasAbstarct;
-
-  // longnameList: data set long name list
-  private List<String> longnameList;
-  // keywordList: data set key word list
-  private List<String> keywordList;
-  // termList: data set term list
-  private List<String> termList;
-  // topicList: data set topic list
-  private List<String> topicList;
-  // variableList: data set variable list
-  private List<String> variableList;
-  // abstractList: data set abstract term list
-  private List<String> abstractList;
-  // isotopicList: data set iso topic list
-  private List<String> isotopicList;
-  // sensorList: data set sensor list
-  private List<String> sensorList;
-  // sourceList: data set source list
-  private List<String> sourceList;
-  // projectList: data set project list
-  private List<String> projectList;
-  // regionList: data set region list
-  private List<String> regionList;
-
-  /**
-   * Creates an empty instance; populate it through the {@code set*} methods.
-   */
-  public PODAACMetadata() {
-    // Default constructor
-  }
-
-  /**
-   * Creates a new instance of PODAACMetadata. The supplied lists are stored
-   * as given (no copy is made) and may be {@code null}.
-   *
-   * @param shortname data set short name
-   * @param longname  data set long name
-   * @param topics    data set topics
-   * @param terms     data set terms
-   * @param variables data set variables
-   * @param keywords  data set keywords
-   * @param region    list of regions
-   */
-  public PODAACMetadata(String shortname, List<String> longname, List<String> topics, List<String> terms, List<String> variables, List<String> keywords, List<String> region) {
-    this.shortname = shortname;
-    this.longnameList = longname;
-    this.keywordList = keywords;
-    this.termList = terms;
-    this.topicList = topics;
-    this.variableList = variables;
-    this.regionList = region;
-  }
-
-  /**
-   * setTerms: append the comma-separated terms to the term list.
-   *
-   * @param termstr data set terms
-   */
-  public void setTerms(String termstr) {
-    this.termList = this.splitString(termstr, this.termList);
-  }
-
-  /**
-   * setKeywords: append the comma-separated keywords to the keyword list.
-   *
-   * @param keywords data set keywords
-   */
-  public void setKeywords(String keywords) {
-    this.keywordList = this.splitString(keywords, this.keywordList);
-  }
-
-  /**
-   * setTopicList: append the comma-separated topics to the topic list.
-   *
-   * @param topicStr data set topics
-   */
-  public void setTopicList(String topicStr) {
-    this.topicList = this.splitString(topicStr, this.topicList);
-  }
-
-  /**
-   * setVaraliableList: append the comma-separated variables to the variable
-   * list.
-   *
-   * @param varilableStr data set variables
-   */
-  public void setVaraliableList(String varilableStr) {
-    this.variableList = this.splitString(varilableStr, this.variableList);
-  }
-
-  /**
-   * setProjectList: append the comma-separated projects to the project list.
-   *
-   * @param project data set projects
-   */
-  public void setProjectList(String project) {
-    this.projectList = this.splitString(project, this.projectList);
-  }
-
-  /**
-   * setSourceList: append the comma-separated sources to the source list.
-   *
-   * @param source data set sources
-   */
-  public void setSourceList(String source) {
-    this.sourceList = this.splitString(source, this.sourceList);
-  }
-
-  /**
-   * setSensorList: append the comma-separated sensors to the sensor list.
-   *
-   * @param sensor data set sensors
-   */
-  public void setSensorList(String sensor) {
-    this.sensorList = this.splitString(sensor, this.sensorList);
-  }
-
-  /**
-   * setISOTopicList: append the comma-separated ISO topics to the ISO topic
-   * list.
-   *
-   * @param isoTopic data set iso topics
-   */
-  public void setISOTopicList(String isoTopic) {
-    this.isotopicList = this.splitString(isoTopic, this.isotopicList);
-  }
-
-  /**
-   * getKeywordList: get key word of data set
-   *
-   * @return data set keyword list, or {@code null} if never populated
-   */
-  public List<String> getKeywordList() {
-    return this.keywordList;
-  }
-
-  /**
-   * getTermList: get term list of data set
-   *
-   * @return data set term list, or {@code null} if never populated
-   */
-  public List<String> getTermList() {
-    return this.termList;
-  }
-
-  /**
-   * getShortName: get short name of data set
-   *
-   * @return data set short name
-   */
-  public String getShortName() {
-    return this.shortname;
-  }
-
-  /**
-   * getKeyword: get key word of data set
-   *
-   * @return keywords joined with commas; empty if there are none
-   */
-  public String getKeyword() {
-    return join(this.keywordList);
-  }
-
-  /**
-   * getTerm: get term of data set
-   *
-   * @return terms joined with commas; empty if there are none
-   */
-  public String getTerm() {
-    return join(this.termList);
-  }
-
-  /**
-   * getTopic: get topic of data set
-   *
-   * @return topics joined with commas; empty if there are none
-   */
-  public String getTopic() {
-    return join(this.topicList);
-  }
-
-  /**
-   * getVariable: get variable of data set
-   *
-   * @return variables joined with commas; empty if there are none
-   */
-  public String getVariable() {
-    return join(this.variableList);
-  }
-
-  /**
-   * getAbstract: get abstract of data set
-   *
-   * @return data set abstract
-   */
-  public String getAbstract() {
-    return this.abstractStr;
-  }
-
-  /**
-   * getProject: get project of data set
-   *
-   * @return data set project string
-   */
-  public String getProject() {
-    return this.project;
-  }
-
-  /**
-   * getSource: get source of data set
-   *
-   * @return data set source string
-   */
-  public String getSource() {
-    return this.source;
-  }
-
-  /**
-   * getSensor: get sensor of data set
-   *
-   * @return data set sensor string
-   */
-  public String getSensor() {
-    return this.sensor;
-  }
-
-  /**
-   * getISOTopic: get iso topic of data set
-   *
-   * @return data set ISO topic string
-   */
-  public String getISOTopic() {
-    return this.isoTopic;
-  }
-
-  /**
-   * getAllTermList: collect terms, keywords, topics, variables and regions
-   * (in that order) into one new list. Lists that are {@code null} or empty
-   * contribute nothing.
-   *
-   * @return data set term list, never {@code null}
-   */
-  public List<String> getAllTermList() {
-    List<String> allterms = new ArrayList<>();
-    appendAll(allterms, this.termList);
-    appendAll(allterms, this.keywordList);
-    appendAll(allterms, this.topicList);
-    appendAll(allterms, this.variableList);
-    appendAll(allterms, this.regionList);
-    return allterms;
-  }
-
-  /**
-   * Adds every element of {@code source} to {@code target}, ignoring a
-   * {@code null} source.
-   */
-  private static void appendAll(List<String> target, List<String> source) {
-    if (source != null) {
-      target.addAll(source);
-    }
-  }
-
-  /**
-   * Joins the values with commas, treating a {@code null} list as empty.
-   */
-  private static String join(List<String> values) {
-    return values == null ? "" : String.join(",", values);
-  }
-
-  /**
-   * splitString: split a comma-separated field value and append the tokens
-   * to a list. One enclosing double quote is stripped from the whole value
-   * and from each token, tokens are trimmed, and empty tokens are dropped.
-   *
-   * @param oristr original string; {@code null} leaves the list untouched
-   * @param list   list to append to; created when {@code null}
-   * @return the list that received the tokens ({@code list} itself when it
-   * was not {@code null})
-   */
-  private List<String> splitString(String oristr, List<String> list) {
-    if (oristr == null) {
-      return list;
-    }
-    List<String> target = (list != null) ? list : new ArrayList<String>();
-
-    if (oristr.startsWith("\"")) {
-      oristr = oristr.substring(1);
-    }
-    if (oristr.endsWith("\"")) {
-      oristr = oristr.substring(0, oristr.length() - 1);
-    }
-
-    // Tokens cannot contain a comma after the split, so only quotes need stripping.
-    for (String token : oristr.trim().split(",")) {
-      String str = token.trim();
-      if (str.startsWith("\"")) {
-        str = str.substring(1);
-      }
-      if (str.endsWith("\"")) {
-        str = str.substring(0, str.length() - 1);
-      }
-      // Compare by content: the former reference comparison (== "") did not
-      // reliably detect empty tokens.
-      if (str.isEmpty()) {
-        continue;
-      }
-      target.add(str);
-    }
-    return target;
-  }
-}
http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/metadata/structure/package-info.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/metadata/structure/package-info.java b/core/src/main/java/gov/nasa/jpl/mudrod/metadata/structure/package-info.java
deleted file mode 100644
index d7de65d..0000000
--- a/core/src/main/java/gov/nasa/jpl/mudrod/metadata/structure/package-info.java
+++ /dev/null
@@ -1,17 +0,0 @@
-/*
- * Licensed under the Apache License, Version 2.0 (the "License"); you
- * may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-/**
- * This package includes classes needed for metadata analysis
- */
-package gov.nasa.jpl.mudrod.metadata.structure;
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/ontology/Ontology.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/ontology/Ontology.java b/core/src/main/java/gov/nasa/jpl/mudrod/ontology/Ontology.java
deleted file mode 100644
index 7bc76fb..0000000
--- a/core/src/main/java/gov/nasa/jpl/mudrod/ontology/Ontology.java
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Licensed under the Apache License, Version 2.0 (the "License"); you
- * may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package gov.nasa.jpl.mudrod.ontology;
-
-import java.util.Iterator;
-
-/**
- * Contract for working with ontologies inside Mudrod. Implementations can
- * load ontology resources, merge relevant ontology subgraphs into a new
- * subgraph, retrieve subclasses and expand synonyms.
- *
- * @author lewismc
- */
-public interface Ontology {
-
-  /**
-   * Loads the ontology resources that the given URIs resolve to.
-   *
-   * @param urls an array of {@link java.lang.String}s containing ontology URIs.
-   */
-  void load(String[] urls);
-
-  /**
-   * Loads a collection of default ontology resources.
-   */
-  void load();
-
-  /**
-   * Merges relevant ontology subgraphs into a new subgraph which can be used
-   * within Mudrod.
-   *
-   * @param o an ontology to merge with the current ontology held within
-   *          Mudrod.
-   */
-  void merge(Ontology o);
-
-  /**
-   * Retrieves all subclasses for the entity named by the search term, e.g.
-   * for subclass-based query expansion.
-   *
-   * @param entitySearchTerm an input search term
-   * @return an {@link java.util.Iterator} object containing subClass entries.
-   */
-  Iterator<String> subclasses(String entitySearchTerm);
-
-  /**
-   * Retrieves all synonyms for the entity named by the search phrase, e.g.
-   * for synonym-based query expansion.
-   *
-   * @param queryKeyPhrase a phrase to undertake synonym expansion on.
-   * @return an {@link java.util.Iterator} object containing synonym entries.
-   */
-  Iterator<String> synonyms(String queryKeyPhrase);
-
-}
http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/ontology/OntologyFactory.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/ontology/OntologyFactory.java b/core/src/main/java/gov/nasa/jpl/mudrod/ontology/OntologyFactory.java
deleted file mode 100644
index f0ef6cd..0000000
--- a/core/src/main/java/gov/nasa/jpl/mudrod/ontology/OntologyFactory.java
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Licensed under the Apache License, Version 2.0 (the "License"); you
- * may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package gov.nasa.jpl.mudrod.ontology;
-
-import gov.nasa.jpl.mudrod.main.MudrodConstants;
-import gov.nasa.jpl.mudrod.ontology.process.EsipCOROntology;
-import gov.nasa.jpl.mudrod.ontology.process.EsipPortalOntology;
-import gov.nasa.jpl.mudrod.ontology.process.LocalOntology;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.util.Properties;
-
-/**
- * Factory that selects the {@link Ontology} implementation to use.
- * The implementation should be specified in
- * <a href="https://github.com/mudrod/mudrod/blob/master/core/src/main/resources/config.xml">
- * config.xml</a> with configuration key
- * <code>mudrod.ontology.implementation</code>; the same property is
- * available as {@link MudrodConstants#ONTOLOGY_IMPL}.
- *
- * @author lewismc
- */
-public class OntologyFactory {
-
-  public static final Logger LOG = LoggerFactory.getLogger(OntologyFactory.class);
-
-  private Properties props;
-
-  /**
-   * Creates a factory that reads its configuration from the given properties.
-   *
-   * @param props a populated Mudrod {@link java.util.Properties} object.
-   */
-  public OntologyFactory(Properties props) {
-    this.props = props;
-  }
-
-  /**
-   * Instantiates the {@link Ontology} implementation named by the
-   * {@link MudrodConstants#ONTOLOGY_IMPL} property. The value
-   * <code>EsipCOR</code> yields an {@link EsipCOROntology},
-   * <code>EsipPortal</code> an {@link EsipPortalOntology}; any other value,
-   * or a missing property, yields a {@link LocalOntology}.
-   *
-   * @return a new instance of the selected ontology implementation.
-   */
-  public Ontology getOntology() {
-    final String requested = this.props.getProperty(MudrodConstants.ONTOLOGY_IMPL, "Local");
-    LOG.info("Using ontology extension: {}", requested);
-
-    if ("EsipCOR".equals(requested)) {
-      return new EsipCOROntology();
-    }
-    if ("EsipPortal".equals(requested)) {
-      return new EsipPortalOntology();
-    }
-    // Everything else, including the "Local" default, maps to the local ontology.
-    return new LocalOntology();
-  }
-
-}
http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/ontology/package-info.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/ontology/package-info.java b/core/src/main/java/gov/nasa/jpl/mudrod/ontology/package-info.java
deleted file mode 100644
index 3763634..0000000
--- a/core/src/main/java/gov/nasa/jpl/mudrod/ontology/package-info.java
+++ /dev/null
@@ -1,17 +0,0 @@
-/*
- * Licensed under the Apache License, Version 2.0 (the "License"); you
- * may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-/**
- * This package includes ontology pre-processing and processing classes.
- */
-package gov.nasa.jpl.mudrod.ontology;
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/ontology/pre/AggregateTriples.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/ontology/pre/AggregateTriples.java b/core/src/main/java/gov/nasa/jpl/mudrod/ontology/pre/AggregateTriples.java
deleted file mode 100644
index 99de87d..0000000
--- a/core/src/main/java/gov/nasa/jpl/mudrod/ontology/pre/AggregateTriples.java
+++ /dev/null
@@ -1,225 +0,0 @@
-/*
- * Licensed under the Apache License, Version 2.0 (the "License"); you
- * may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package gov.nasa.jpl.mudrod.ontology.pre;
-
-import gov.nasa.jpl.mudrod.discoveryengine.DiscoveryStepAbstract;
-import gov.nasa.jpl.mudrod.driver.ESDriver;
-import gov.nasa.jpl.mudrod.driver.SparkDriver;
-import org.apache.commons.io.FilenameUtils;
-import org.jdom2.Document;
-import org.jdom2.Element;
-import org.jdom2.JDOMException;
-import org.jdom2.Namespace;
-import org.jdom2.filter.ElementFilter;
-import org.jdom2.input.SAXBuilder;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.BufferedWriter;
-import java.io.File;
-import java.io.FileWriter;
-import java.io.IOException;
-import java.util.Arrays;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Properties;
-
-/**
- * Supports ability to extract triples (subclassOf, equivalent class) from OWL file
- */
-public class AggregateTriples extends DiscoveryStepAbstract {
- private static final long serialVersionUID = 1L;
- private static final Logger LOG = LoggerFactory.getLogger(AggregateTriples.class);
-
- /**
-  * Creates the triple-aggregation step; all three arguments are forwarded
-  * unchanged to the superclass constructor.
-  *
-  * @param props configuration properties (presumably exposed as
-  *              {@code this.props} by the superclass - confirm there)
-  * @param es    Elasticsearch driver
-  * @param spark Spark driver
-  */
- public AggregateTriples(Properties props, ESDriver es, SparkDriver spark) {
- super(props, es, spark);
- }
-
- /**
- * Method of executing triple aggregation
- */
- @Override
- public Object execute() {
- File file = new File(this.props.getProperty("oceanTriples"));
- if (file.exists()) {
- file.delete();
- }
- try {
- file.createNewFile();
- } catch (IOException e2) {
- e2.printStackTrace();
- }
-
- FileWriter fw;
- try {
- fw = new FileWriter(file.getAbsoluteFile());
- bw = new BufferedWriter(fw);
- } catch (IOException e) {
- e.printStackTrace();
- }
-
- File[] files = new File(this.props.getProperty("ontologyInputDir")).listFiles();
- for (File file_in : files) {
- String ext = FilenameUtils.getExtension(file_in.getAbsolutePath());
- if ("owl".equals(ext)) {
- try {
- loadxml(file_in.getAbsolutePath());
- getAllClass();
- } catch (JDOMException e1) {
- e1.printStackTrace();
- } catch (IOException e1) {
- e1.printStackTrace();
- }
-
- }
- }
-
- try {
- bw.close();
- } catch (IOException e) {
- e.printStackTrace();
- }
- return null;
- }
-
- public Document document;
- public Element rootNode = null;
- final static String owl_namespace = "http://www.w3.org/2002/07/owl#";
- final static String rdf_namespace = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";
- final static String rdfs_namespace = "http://www.w3.org/2000/01/rdf-schema#";
-
- BufferedWriter bw = null;
-
- /**
-  * Parses an OWL file and keeps the result in memory: the parsed document is
-  * stored in {@code document} and its root element in {@code rootNode}.
-  *
-  * @param filePathName local path of OWL file
-  * @throws JDOMException JDOMException
-  * @throws IOException   IOException
-  */
- public void loadxml(String filePathName) throws JDOMException, IOException {
-   final File owlFile = new File(filePathName);
-   document = new SAXBuilder().build(owlFile);
-   rootNode = document.getRootElement();
- }
-
- /**
-  * Walks every descendant element of {@code rootNode} and logs it at INFO
-  * level: the element name alone when its trimmed text is empty, otherwise
-  * the name followed by the trimmed text.
-  */
- public void loopxml() {
-   for (Iterator<?> descendants = rootNode.getDescendants(new ElementFilter()); descendants.hasNext();) {
-     Element element = (Element) descendants.next();
-     String elementName = element.getName();
-     String trimmedText = element.getTextTrim();
-     if (!"".equals(trimmedText)) {
-       LOG.info("{} : {}", elementName, trimmedText);
-     } else {
-       LOG.info(elementName);
-     }
-   }
- }
-
- /**
-  * Finds the first descendant element (at any depth, in the order the
-  * descendant iterator yields them) whose name equals the given name.
-  *
-  * @param str element name
-  * @param ele parent element
-  * @return the matching descendant, or {@code null} when none matches
-  */
- public Element findChild(String str, Element ele) {
-   Iterator<?> descendants = ele.getDescendants(new ElementFilter());
-   while (descendants.hasNext()) {
-     Element candidate = (Element) descendants.next();
-     if (candidate.getName().equals(str)) {
-       return candidate;
-     }
-   }
-   return null;
- }
-
- /**
-  * Method of extract triples (subclassOf, equivalent class) from OWL file.
-  * <p>
-  * For every {@code owl:Class} child of {@code rootNode} one line is written
-  * to {@code bw} per {@code rdfs:subClassOf} and per
-  * {@code owl:equivalentClass} child, in the form
-  * {@code <class>,SubClassOf,<parent>} or
-  * {@code <class>,equivalentClass,<other>}, with names cleaned by
-  * {@link #cutString(String)}. The class name is taken from {@code rdf:about},
-  * falling back to {@code rdf:ID}. When a {@code subClassOf} element has no
-  * {@code rdf:resource}, the resource of its first {@code allValuesFrom}
-  * descendant is used instead. Classes or relations whose name cannot be
-  * resolved are skipped.
-  *
-  * @throws IOException IOException
-  */
- public void getAllClass() throws IOException {
-   final Namespace owl = Namespace.getNamespace("owl", owl_namespace);
-   final Namespace rdf = Namespace.getNamespace("rdf", rdf_namespace);
-   final Namespace rdfs = Namespace.getNamespace("rdfs", rdfs_namespace);
-
-   List<Element> classElements = rootNode.getChildren("Class", owl);
-   for (Element classElement : classElements) {
-     String className = classElement.getAttributeValue("about", rdf);
-     if (className == null) {
-       className = classElement.getAttributeValue("ID", rdf);
-     }
-     if (className == null) {
-       // Anonymous class: there is no subject to write a triple for.
-       continue;
-     }
-
-     List<Element> subclassElements = classElement.getChildren("subClassOf", rdfs);
-     for (Element subclassElement : subclassElements) {
-       String subclassName = subclassElement.getAttributeValue("resource", rdf);
-       if (subclassName == null) {
-         Element allValuesFromEle = findChild("allValuesFrom", subclassElement);
-         if (allValuesFromEle != null) {
-           subclassName = allValuesFromEle.getAttributeValue("resource", rdf);
-         }
-       }
-       // Still null when neither element carries rdf:resource; skip it.
-       if (subclassName != null) {
-         bw.write(cutString(className) + ",SubClassOf," + cutString(subclassName) + "\n");
-       }
-     }
-
-     List<Element> equalClassElements = classElement.getChildren("equivalentClass", owl);
-     for (Element equalClassElement : equalClassElements) {
-       String equalClassElementName = equalClassElement.getAttributeValue("resource", rdf);
-       if (equalClassElementName != null) {
-         bw.write(cutString(className) + ",equivalentClass," + cutString(equalClassElementName) + "\n");
-       }
-     }
-   }
- }
-
- /**
- * Method of cleaning up a string
- *
- * @param str String needed to be processed
- * @return the processed string
- */
- public String cutString(String str) {
- str = str.substring(str.indexOf("#") + 1);
- String[] strArray = str.split("(?=[A-Z])");
- str = Arrays.toString(strArray);
- return str.substring(1, str.length() - 1).replace(",", "");
- }
-
- // This overload performs no work: the argument is ignored and null is
- // always returned.
- @Override
- public Object execute(Object o) {
- return null;
- }
-
-}
http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/ontology/pre/package-info.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/ontology/pre/package-info.java b/core/src/main/java/gov/nasa/jpl/mudrod/ontology/pre/package-info.java
deleted file mode 100644
index 0570bc7..0000000
--- a/core/src/main/java/gov/nasa/jpl/mudrod/ontology/pre/package-info.java
+++ /dev/null
@@ -1,17 +0,0 @@
-/*
- * Licensed under the Apache License, Version 2.0 (the "License"); you
- * may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-/**
- * This package includes ontology pre-processing classes.
- */
-package gov.nasa.jpl.mudrod.ontology.pre;
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/EsipCOROntology.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/EsipCOROntology.java b/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/EsipCOROntology.java
deleted file mode 100644
index 6194197..0000000
--- a/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/EsipCOROntology.java
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Licensed under the Apache License, Version 2.0 (the "License"); you
- * may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package gov.nasa.jpl.mudrod.ontology.process;
-
-import gov.nasa.jpl.mudrod.ontology.Ontology;
-
-import java.util.Iterator;
-
-/**
- * Stub {@link Ontology} implementation: loading and merging are no-ops and
- * the lookup methods never yield any results.
- *
- * @author lewismc
- */
-public class EsipCOROntology implements Ontology {
-
-  /**
-   * Creates the ontology; there is no state to initialise.
-   */
-  public EsipCOROntology() {
-    //default constructor
-  }
-
-  /**
-   * Not yet implemented; currently does nothing.
-   *
-   * @see Ontology#load()
-   */
-  @Override
-  public void load() {
-    // to be completed
-  }
-
-  /**
-   * Not yet implemented; currently does nothing.
-   *
-   * @param urls ontology resource locations (ignored)
-   * @see Ontology#load(java.lang.String[])
-   */
-  @Override
-  public void load(String[] urls) {
-    // to be completed
-  }
-
-  /**
-   * Not yet implemented; currently does nothing.
-   *
-   * @param o the ontology to merge (ignored)
-   * @see Ontology#merge(Ontology)
-   */
-  @Override
-  public void merge(Ontology o) {
-    // to be completed
-  }
-
-  /**
-   * Not yet implemented.
-   *
-   * @param entitySearchTerm the term to look up (ignored)
-   * @return an empty iterator, never {@code null}
-   * @see Ontology#subclasses(java.lang.String)
-   */
-  @Override
-  public Iterator<String> subclasses(String entitySearchTerm) {
-    // Empty rather than null so callers can iterate without a null check.
-    return java.util.Collections.<String>emptyIterator();
-  }
-
-  /**
-   * Not yet implemented.
-   *
-   * @param queryKeyPhrase the phrase to look up (ignored)
-   * @return an empty iterator, never {@code null}
-   * @see Ontology#synonyms(java.lang.String)
-   */
-  @Override
-  public Iterator<String> synonyms(String queryKeyPhrase) {
-    // Empty rather than null so callers can iterate without a null check.
-    return java.util.Collections.<String>emptyIterator();
-  }
-
-}
http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/EsipPortalOntology.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/EsipPortalOntology.java b/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/EsipPortalOntology.java
deleted file mode 100644
index 9c4888b..0000000
--- a/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/EsipPortalOntology.java
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Licensed under the Apache License, Version 2.0 (the "License"); you
- * may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package gov.nasa.jpl.mudrod.ontology.process;
-
-import gov.nasa.jpl.mudrod.ontology.Ontology;
-
-import java.util.Iterator;
-
-/**
- * Stub {@link Ontology} implementation: loading and merging are no-ops and
- * the lookup methods never yield any results.
- *
- * @author lewismc
- */
-public class EsipPortalOntology implements Ontology {
-
-  /**
-   * Creates the ontology; there is no state to initialise.
-   */
-  public EsipPortalOntology() {
-    //default constructor
-  }
-
-  /**
-   * Not yet implemented; currently does nothing.
-   *
-   * @param urls ontology resource locations (ignored)
-   * @see Ontology#load(java.lang.String[])
-   */
-  @Override
-  public void load(String[] urls) {
-    // to be completed
-  }
-
-  /**
-   * Not yet implemented; currently does nothing.
-   *
-   * @see Ontology#load()
-   */
-  @Override
-  public void load() {
-    // to be completed
-  }
-
-  /**
-   * Not yet implemented; currently does nothing.
-   *
-   * @param o the ontology to merge (ignored)
-   * @see Ontology#merge(Ontology)
-   */
-  @Override
-  public void merge(Ontology o) {
-    // to be completed
-  }
-
-  /**
-   * Not yet implemented.
-   *
-   * @param entitySearchTerm the term to look up (ignored)
-   * @return an empty iterator, never {@code null}
-   * @see Ontology#subclasses(java.lang.String)
-   */
-  @Override
-  public Iterator<String> subclasses(String entitySearchTerm) {
-    // Empty rather than null so callers can iterate without a null check.
-    return java.util.Collections.<String>emptyIterator();
-  }
-
-  /**
-   * Not yet implemented.
-   *
-   * @param queryKeyPhrase the phrase to look up (ignored)
-   * @return an empty iterator, never {@code null}
-   * @see Ontology#synonyms(java.lang.String)
-   */
-  @Override
-  public Iterator<String> synonyms(String queryKeyPhrase) {
-    // Empty rather than null so callers can iterate without a null check.
-    return java.util.Collections.<String>emptyIterator();
-  }
-
-}
http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/LocalOntology.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/LocalOntology.java b/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/LocalOntology.java
deleted file mode 100644
index 55ca51d..0000000
--- a/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/LocalOntology.java
+++ /dev/null
@@ -1,392 +0,0 @@
-/*
- * Licensed under the Apache License, Version 2.0 (the "License"); you
- * may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package gov.nasa.jpl.mudrod.ontology.process;
-
-import gov.nasa.jpl.mudrod.ontology.Ontology;
-
-import org.apache.jena.ontology.Individual;
-import org.apache.jena.ontology.OntClass;
-import org.apache.jena.ontology.OntModel;
-import org.apache.jena.ontology.OntModelSpec;
-import org.apache.jena.ontology.OntResource;
-import org.apache.jena.ontology.Restriction;
-import org.apache.jena.rdf.model.AnonId;
-import org.apache.jena.rdf.model.Literal;
-import org.apache.jena.rdf.model.ModelFactory;
-import org.apache.jena.rdf.model.Resource;
-import org.apache.jena.shared.PrefixMapping;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.PrintStream;
-import java.net.MalformedURLException;
-import java.net.URISyntaxException;
-import java.net.URL;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-
-/**
- * The LocalOntology implementation enables us to work with Ontology files
- * which are cached locally and available on the runtime classpath e.g.
- * in <code>src/main/resource/ontology/...</code>.
- * From here we can test and iterate on how use of ontology can enhance search.
- *
- * <p>The parser, the Jena model and the anonymous-ID bookkeeping are held in
- * static fields and are therefore shared by every instance; the search-term
- * index is per instance.
- */
-public class LocalOntology implements Ontology {
-
-  public static final Logger LOG = LoggerFactory.getLogger(LocalOntology.class);
-
-  public static final String DELIMITER_SEARCHTERM = " ";
-
-  /** Lower-cased label mapped to the ontology resources registered under it. */
-  private Map<String, Map<OntResource, String>> searchTerms = new HashMap<>();
-  private static OntologyParser parser;
-  private static OntModel ontologyModel;
-  private Ontology ontology;
-  private static Map<AnonId, String> mAnonIDs = new HashMap<>();
-  private static int mAnonCount = 0;
-  private List<String> ontArrayList;
-
-  /**
-   * Creates the ontology, installs a fresh {@link OwlParser} as the shared
-   * parser, creates the shared {@link OntModel} if none exists yet and then
-   * loads the default ontology via {@link #load()}.
-   */
-  public LocalOntology() {
-    // 'ontology' is an instance field, so it is always unset when the
-    // constructor starts; this initialisation runs for every new instance.
-    LOG.info("Creating new ontology");
-    parser = new OwlParser();
-    ontology = this;
-    if (ontologyModel == null) {
-      ontologyModel = ModelFactory.createOntologyModel(OntModelSpec.OWL_MEM, null);
-    }
-    load();
-  }
-
-  /**
-   * Accessor for the {@link Ontology} held by this object. The constructor
-   * sets it to this instance; a new {@link LocalOntology} is only created
-   * if the field is found unset.
-   *
-   * @return a {@link LocalOntology}
-   */
-  public Ontology getInstance() {
-    if (ontology == null) {
-      ontology = new LocalOntology();
-    }
-    return ontology;
-  }
-
-  /**
-   * Load the default <i>sweetAll.owl</i> ontology
-   * from <a href="https://raw.githubusercontent.com/ESIPFed/sweet/master/2.4/sweetAll.owl">
-   * https://raw.githubusercontent.com/ESIPFed/sweet/master/2.4/sweetAll.owl</a>
-   */
-  @Override
-  public void load() {
-    ontArrayList = new ArrayList<>();
-    try {
-      // Build and convert the URL in one guarded step so that a failure to
-      // construct it can never lead to a null dereference afterwards.
-      URL ontURL = new URL("https://raw.githubusercontent.com/ESIPFed/sweet/master/2.4/sweetAll.owl");
-      ontArrayList.add(ontURL.toURI().toString());
-    } catch (MalformedURLException e) {
-      LOG.error("Error when attempting to create URL resource: ", e);
-    } catch (URISyntaxException e) {
-      LOG.error("Error in URL syntax, please check your Ontology resource: ", e);
-    }
-    if (!ontArrayList.isEmpty()) {
-      load(ontArrayList.toArray(new String[0]));
-    }
-  }
-
-  /**
-   * Load a string array of URIs which reference .owl files. Entries that
-   * are empty after trimming are skipped. Once all entries have been read
-   * into the shared model, the model is handed to the parser.
-   *
-   * @param urls the ontology resources to read
-   */
-  @Override
-  public void load(String[] urls) {
-    for (String rawUrl : urls) {
-      String url = rawUrl.trim();
-      if (url.isEmpty()) {
-        // nothing to read for a blank entry
-        continue;
-      }
-      LOG.info("Reading and processing {}", url);
-      load(ontologyModel, url);
-    }
-    parser.parse(ontology, ontologyModel);
-  }
-
-  /**
-   * Reads a single ontology resource into the given model. Failures are
-   * logged and otherwise ignored so that the remaining resources can still
-   * be processed.
-   */
-  private void load(OntModel model, String url) {
-    try {
-      model.read(url, null, null);
-      LOG.info("Successfully processed {}", url);
-    } catch (Exception e) {
-      LOG.error("Failed whilst attempting to read ontology {}: Error: ", url, e);
-    }
-  }
-
-  /**
-   * Get the {@link gov.nasa.jpl.mudrod.ontology.process.OntologyParser}
-   * implementation being used to process the input ontology resources.
-   * @return an {@link gov.nasa.jpl.mudrod.ontology.process.OntologyParser} implementation
-   */
-  public OntologyParser getParser() {
-    if (parser == null) {
-      parser = new OwlParser();
-    }
-    return parser;
-  }
-
-  /**
-   * Return the {@link org.apache.jena.ontology.OntModel} instance
-   * which was created from the input ontology resources.
-   * @return a constructed {@link org.apache.jena.ontology.OntModel}
-   */
-  public static OntModel getModel() {
-    return ontologyModel;
-  }
-
-  /**
-   * Return the loaded Ontology resources.
-   * @return a {@link java.util.List} of resources.
-   */
-  public List<String> getLoadedOntologyResources() {
-    if (ontArrayList != null) {
-      return ontArrayList;
-    } else {
-      return new ArrayList<>();
-    }
-  }
-
-  /**
-   * Not yet implemented.
-   */
-  @Override
-  public void merge(Ontology o) {
-    // not yet implemented
-  }
-
-  /**
-   * Retrieve all subclasses of entity(ies) hashed to searchTerm
-   * @param entitySearchTerm a query (keywords) for which to obtain
-   * subclasses.
-   * @return an {@link java.util.Iterator} containing the subclass as Strings.
-   */
-  @Override
-  public Iterator<String> subclasses(String entitySearchTerm) {
-    Map<String, String> subclasses = new HashMap<>();
-
-    for (OntResource resource : retrieve(entitySearchTerm).keySet()) {
-      if (resource instanceof OntClass) {
-        OntClass ontClass = (OntClass) resource;
-        //get subclasses N.B. we only get direct sub-classes e.g. direct children
-        //it is possible for us to navigate the entire class tree if we wish, we simply
-        //need to pass the .listSubClasses(true) boolean parameter.
-        for (Iterator<?> i = ontClass.listSubClasses(); i.hasNext();) {
-          collectLabels((OntResource) i.next(), subclasses);
-        }
-        //get individuals
-        for (Iterator<?> i = ontClass.listInstances(); i.hasNext();) {
-          collectLabels((OntResource) i.next(), subclasses);
-        }
-      } else if (resource instanceof Individual) {
-        for (Iterator<?> i = resource.listSameAs(); i.hasNext();) {
-          collectLabels((OntResource) i.next(), subclasses);
-        }
-      }
-    }
-    return subclasses.keySet().iterator();
-  }
-
-  /**
-   * Retrieves synonyms for a given phrase if the phrase
-   * is present in the ontology
-   * @param queryKeyPhrase an input string representing a phrase
-   * for which we wish to obtain synonyms.
-   * @return an {@link java.util.Iterator} containing synonyms string tokens
-   * or an empty iterator if no synonyms exist for the given queryKeyPhrase.
-   */
-  @Override
-  public Iterator<String> synonyms(String queryKeyPhrase) {
-    Map<String, String> synonyms = new HashMap<>();
-
-    for (OntResource resource : retrieve(queryKeyPhrase).keySet()) {
-      //the resource's own labels
-      collectLabels(resource, synonyms);
-
-      if (resource instanceof Individual) {
-        //labels of all individuals same as this one; each individual's own
-        //label iterator is consumed, not the outer sameAs iterator
-        for (Iterator<?> i = resource.listSameAs(); i.hasNext();) {
-          collectLabels((OntResource) i.next(), synonyms);
-        }
-      } else if (resource instanceof OntClass) {
-        //labels of all equivalent classes
-        for (Iterator<?> i = ((OntClass) resource).listEquivalentClasses(); i.hasNext();) {
-          collectLabels((OntResource) i.next(), synonyms);
-        }
-      }
-    }
-
-    return synonyms.keySet().iterator();
-  }
-
-  /**
-   * Adds the string form of every label of the given resource as a key of
-   * the supplied map.
-   */
-  private static void collectLabels(OntResource resource, Map<String, String> labels) {
-    for (Iterator<?> i = resource.listLabels(null); i.hasNext();) {
-      Literal label = (Literal) i.next();
-      labels.put(label.toString(), "1");
-    }
-  }
-
-  /**
-   * Registers a resource under the given label. Labels are stored
-   * lower-cased, so lookups are case-insensitive.
-   *
-   * @param label the label (phrase or token) to register
-   * @param resource the ontology resource carrying that label
-   */
-  public void addSearchTerm(String label, OntResource resource) {
-    // retrieve never returns null: it hands back the stored map or a new one
-    Map<OntResource, String> m = retrieve(label);
-    m.put(resource, "1");
-    searchTerms.put(label.toLowerCase(), m);
-  }
-
-  /**
-   * A basic lookup function for retrieving keys (phrases or tokens)
-   * from the ontology search terms map. Right now only exact lookups
-   * will retrieve a result... this could be improved by using some
-   * advanced parsing logic... such as Lucene query parser.
-   * @param label the label (phrases or tokens) to retrieve from the
-   * ontology search terms map.
-   * @return an {@link java.util.Map} if there are match(es)
-   * or an empty {@link java.util.HashMap} if there are no
-   * matches.
-   */
-  public Map<OntResource, String> retrieve(String label) {
-    Map<OntResource, String> m = searchTerms.get(label.toLowerCase());
-    if (m == null) {
-      m = new HashMap<>();
-    }
-    return m;
-  }
-
-  /**
-   * Prints the description of a class followed, recursively, by its direct
-   * subclasses and its instances.
-   *
-   * @param out the stream to print to
-   * @param cls the class to render
-   * @param occurs the classes currently being rendered further up the
-   * recursion, used to avoid descending into a cycle
-   * @param depth the current nesting depth, used for indentation
-   */
-  protected static void renderHierarchy(PrintStream out, OntClass cls, List<Object> occurs, int depth) {
-    renderClassDescription(out, cls, depth);
-    out.println();
-
-    // recurse to the next level down
-    if (cls.canAs(OntClass.class) && !occurs.contains(cls)) {
-      for (Iterator<?> i = cls.listSubClasses(true); i.hasNext(); ) {
-        OntClass sub = (OntClass) i.next();
-
-        // we push this expression on the occurs list before we recurse
-        occurs.add(cls);
-        renderHierarchy(out, sub, occurs, depth + 1);
-        occurs.remove(cls);
-      }
-      for (Iterator<?> i = cls.listInstances(); i.hasNext(); ) {
-        Individual individual = (Individual) i.next();
-        renderURI(out, individual.getModel(), individual.getURI());
-        out.print(" [");
-        for (Iterator<?> j = individual.listLabels(null); j.hasNext(); ) {
-          out.print(((Literal) j.next()).getString() + ", ");
-        }
-        out.print("] ");
-        out.println();
-      }
-    }
-  }
-
-  /**
-   * Prints a one-line description of a class: a restriction, a named class
-   * with its labels, or an anonymous class.
-   *
-   * @param out the stream to print to
-   * @param c the class to describe
-   * @param depth the nesting depth, used for indentation
-   */
-  public static void renderClassDescription(PrintStream out, OntClass c, int depth) {
-    indent(out, depth);
-
-    if (c.isRestriction()) {
-      renderRestriction(out, (Restriction) c.as(Restriction.class));
-    } else {
-      if (!c.isAnon()) {
-        out.print("Class ");
-        renderURI(out, c.getModel(), c.getURI());
-
-        out.print(c.getLocalName());
-
-        out.print(" [");
-        for (Iterator<?> i = c.listLabels(null); i.hasNext(); ) {
-          out.print(((Literal) i.next()).getString() + ", ");
-        }
-        out.print("] ");
-      } else {
-        renderAnonymous(out, c, "class");
-      }
-    }
-  }
-
-  /**
-   * Prints a restriction (named or anonymous) and the property it applies to.
-   */
-  protected static void renderRestriction(PrintStream out, Restriction r) {
-    if (!r.isAnon()) {
-      out.print("Restriction ");
-      renderURI(out, r.getModel(), r.getURI());
-    } else {
-      renderAnonymous(out, r, "restriction");
-    }
-
-    out.print(" on property ");
-    renderURI(out, r.getModel(), r.getOnProperty().getURI());
-  }
-
-  /**
-   * Prints the given URI after passing it through the prefix mapping's
-   * {@code expandPrefix}.
-   */
-  protected static void renderURI(PrintStream out, PrefixMapping prefixes, String uri) {
-    out.print(prefixes.expandPrefix(uri));
-  }
-
-  /**
-   * Prints an anonymous resource using a generated ID of the form
-   * {@code a-N}; the same resource is given the same ID on every call.
-   */
-  protected static void renderAnonymous(PrintStream out, Resource anon, String name) {
-    String anonID = mAnonIDs.get(anon.getId());
-    if (anonID == null) {
-      anonID = "a-" + mAnonCount++;
-      mAnonIDs.put(anon.getId(), anonID);
-    }
-
-    out.print("Anonymous ");
-    out.print(name);
-    out.print(" with ID ");
-    out.print(anonID);
-  }
-
-  /**
-   * Prints one indentation unit per level of depth.
-   */
-  protected static void indent(PrintStream out, int depth) {
-    for (int i = 0; i < depth; i++) {
-      out.print(" ");
-    }
-  }
-
-}
http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/OntologyLinkCal.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/OntologyLinkCal.java b/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/OntologyLinkCal.java
deleted file mode 100644
index a68a0cb..0000000
--- a/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/OntologyLinkCal.java
+++ /dev/null
@@ -1,113 +0,0 @@
-/*
- * Licensed under the Apache License, Version 2.0 (the "License"); you
- * may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package gov.nasa.jpl.mudrod.ontology.process;
-
-import gov.nasa.jpl.mudrod.discoveryengine.DiscoveryStepAbstract;
-import gov.nasa.jpl.mudrod.driver.ESDriver;
-import gov.nasa.jpl.mudrod.driver.SparkDriver;
-import org.elasticsearch.action.index.IndexRequest;
-import org.elasticsearch.common.xcontent.XContentBuilder;
-import org.elasticsearch.index.query.QueryBuilders;
-
-import java.io.BufferedReader;
-import java.io.FileReader;
-import java.io.IOException;
-import java.util.Properties;
-import java.util.concurrent.ExecutionException;
-
-import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
-
-/**
- * Discovery step that reads concept triples from the file named by the
- * {@code oceanTriples} property and indexes them into Elasticsearch as
- * weighted links between two concepts.
- */
-public class OntologyLinkCal extends DiscoveryStepAbstract {
-
-  /**
-   * Creates the step, deletes every existing document of the ontology
-   * linkage type and installs the mapping for that type.
-   *
-   * @param props configuration; {@code indexName} and
-   * {@code ontologyLinkageType} are read here
-   * @param es the Elasticsearch driver
-   * @param spark the Spark driver, passed on to the superclass
-   */
-  public OntologyLinkCal(Properties props, ESDriver es, SparkDriver spark) {
-    super(props, es, spark);
-    es.deleteAllByQuery(props.getProperty("indexName"), props.getProperty("ontologyLinkageType"), QueryBuilders.matchAllQuery());
-    addSWEETMapping();
-  }
-
-  /**
-   * Method of adding mapping for triples extracted from SWEET. Both
-   * {@code concept_A} and {@code concept_B} are mapped as non-analyzed
-   * strings.
-   */
-  public void addSWEETMapping() {
-    String indexName = props.getProperty("indexName");
-    String linkageType = props.getProperty("ontologyLinkageType");
-    try {
-      XContentBuilder mapping = jsonBuilder()
-          .startObject()
-            .startObject(linkageType)
-              .startObject("properties")
-                .startObject("concept_A").field("type", "string").field("index", "not_analyzed").endObject()
-                .startObject("concept_B").field("type", "string").field("index", "not_analyzed").endObject()
-              .endObject()
-            .endObject()
-          .endObject();
-
-      es.getClient().admin().indices().preparePutMapping(indexName).setType(linkageType).setSource(mapping).execute().actionGet();
-    } catch (IOException e) {
-      e.printStackTrace();
-    }
-  }
-
-  /**
-   * Method of calculating and importing SWEET triples into Elasticsearch.
-   * Each input line is lower-cased and split on commas; the third field is
-   * indexed as {@code concept_A} and the first as {@code concept_B}. A
-   * {@code subclassof} relation in the second field gets weight 0.75, any
-   * other relation 0.9. Lines with fewer than three fields are skipped.
-   *
-   * @return always {@code null}
-   */
-  @Override
-  public Object execute() {
-    String indexName = props.getProperty("indexName");
-    String linkageType = props.getProperty("ontologyLinkageType");
-
-    es.deleteType(indexName, linkageType);
-    es.createBulkProcessor();
-
-    try (BufferedReader br = new BufferedReader(new FileReader(props.getProperty("oceanTriples")))) {
-      String line;
-      while ((line = br.readLine()) != null) {
-        String[] triple = line.toLowerCase().split(",");
-        if (triple.length < 3) {
-          // not a complete "conceptB,relation,conceptA" triple
-          continue;
-        }
-        double weight = "subclassof".equals(triple[1]) ? 0.75 : 0.9;
-
-        IndexRequest ir = new IndexRequest(indexName, linkageType).source(
-            jsonBuilder().startObject()
-                .field("concept_A", es.customAnalyzing(indexName, triple[2]))
-                .field("concept_B", es.customAnalyzing(indexName, triple[0]))
-                .field("weight", weight)
-                .endObject());
-        es.getBulkProcessor().add(ir);
-      }
-    } catch (IOException e) {
-      e.printStackTrace();
-    } catch (InterruptedException e) {
-      // keep the interrupt visible to whoever runs this step
-      Thread.currentThread().interrupt();
-      e.printStackTrace();
-    } catch (ExecutionException e) {
-      e.printStackTrace();
-    } finally {
-      // The bulk processor was created above, so it must be released even
-      // when the triples file could not be opened or read.
-      es.destroyBulkProcessor();
-      es.refreshIndex();
-    }
-    return null;
-  }
-
-  /**
-   * Not used by this step.
-   *
-   * @param o ignored
-   * @return always {@code null}
-   */
-  @Override
-  public Object execute(Object o) {
-    return null;
-  }
-
-}
http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/OntologyParser.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/OntologyParser.java b/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/OntologyParser.java
deleted file mode 100644
index eca6252..0000000
--- a/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/OntologyParser.java
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Licensed under the Apache License, Version 2.0 (the "License"); you
- * may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package gov.nasa.jpl.mudrod.ontology.process;
-
-import org.apache.jena.ontology.OntClass;
-import org.apache.jena.ontology.OntModel;
-
-import gov.nasa.jpl.mudrod.ontology.Ontology;
-
-import java.util.Iterator;
-
-/**
- * Contract for parsers that handle one ontology serialisation
- * (e.g. .ttl, RDFXML, etc.).
- */
-public interface OntologyParser {
-
-  /**
-   * Parses an ontology model (RDF graph) for literals.
-   *
-   * @param ont the {@link gov.nasa.jpl.mudrod.ontology.Ontology}
-   * implementation on whose behalf the model is processed
-   * @param ontModel the {@link org.apache.jena.ontology.OntModel} to parse
-   */
-  void parse(Ontology ont, OntModel ontModel);
-
-  /**
-   * Obtains all root classes of an ontology model (RDF graph).
-   *
-   * @param ontModel the {@link org.apache.jena.ontology.OntModel} to inspect
-   * @return an {@link java.util.Iterator} over all root classes
-   */
-  Iterator<OntClass> rootClasses(OntModel ontModel);
-
-}
http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/OwlParser.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/OwlParser.java b/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/OwlParser.java
deleted file mode 100644
index e43f04d..0000000
--- a/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/OwlParser.java
+++ /dev/null
@@ -1,171 +0,0 @@
-/*
- * Licensed under the Apache License, Version 2.0 (the "License"); you
- * may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package gov.nasa.jpl.mudrod.ontology.process;
-
-import org.apache.jena.ontology.Individual;
-import org.apache.jena.ontology.OntClass;
-import org.apache.jena.ontology.OntModel;
-import org.apache.jena.rdf.model.Literal;
-
-import com.esotericsoftware.minlog.Log;
-
-import gov.nasa.jpl.mudrod.ontology.Ontology;
-
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Set;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-/**
- * {@link gov.nasa.jpl.mudrod.ontology.process.OntologyParser}
- * implementation for <a href="http://www.w3.org/TR/owl-features/">W3C OWL</a>
- * files.
- */
-public class OwlParser implements OntologyParser {
-
-  /** A lower-case letter or digit immediately followed by an upper-case letter. */
-  private static final Pattern CAMEL_CASE_BOUNDARY = Pattern.compile("([a-z0-9])([A-Z])");
-
-  private Ontology ont;
-  private List<OntClass> roots = new ArrayList<>();
-
-  public OwlParser() {
-    //default constructor
-  }
-
-  /**
-   * Parse OWL ontology files using Apache Jena. Every non-anonymous root
-   * class of the model is walked and its labels are registered as search
-   * terms on the given ontology.
-   *
-   * @param ont the ontology receiving the search terms; it is cast to
-   * {@link LocalOntology} while parsing
-   * @param m the model to parse
-   */
-  @Override
-  public void parse(Ontology ont, OntModel m) {
-    this.ont = ont;
-    for (Iterator<OntClass> i = rootClasses(m); i.hasNext(); ) {
-      OntClass c = i.next();
-
-      //dont deal with anonymous classes
-      if (c.isAnon()) {
-        continue;
-      }
-
-      parseClass(c, new ArrayList<>(), 0);
-    }
-  }
-
-  /**
-   * Registers the labels of a class, then recurses into its direct
-   * subclasses and registers the labels of its instances.
-   *
-   * @param cls the class to process
-   * @param occurs the classes currently being processed further up the
-   * recursion, used to avoid descending into a cycle
-   * @param depth the current nesting depth
-   */
-  protected void parseClass(OntClass cls, List<Object> occurs, int depth) {
-    //dont deal with anonymous classes
-    if (cls.isAnon()) {
-      return;
-    }
-
-    //add cls to Ontology searchterms
-    //if the class has no labels, add one derived from its rdf:ID
-    if (!cls.listLabels(null).hasNext()) {
-      cls.addLabel(rdfidToLabel(cls.getLocalName()), null);
-    }
-
-    //list the labels afresh so that a label added above is included
-    for (Iterator<?> labelIter = cls.listLabels(null); labelIter.hasNext(); ) {
-      Literal l = (Literal) labelIter.next();
-      ((LocalOntology) ont).addSearchTerm(l.toString(), cls);
-    }
-
-    // recurse to the next level down
-    if (cls.canAs(OntClass.class) && !occurs.contains(cls)) {
-      //list subclasses
-      for (Iterator<?> i = cls.listSubClasses(true); i.hasNext(); ) {
-        OntClass sub = (OntClass) i.next();
-
-        // we push this expression on the occurs list before we recurse
-        occurs.add(cls);
-        parseClass(sub, occurs, depth + 1);
-        occurs.remove(cls);
-      }
-
-      //list instances and add search terms for each instance label
-      for (Iterator<?> i = cls.listInstances(); i.hasNext(); ) {
-        Individual individual = (Individual) i.next();
-        for (Iterator<?> j = individual.listLabels(null); j.hasNext(); ) {
-          Literal l = (Literal) j.next();
-          ((LocalOntology) ont).addSearchTerm(l.toString(), individual);
-        }
-      }
-    }
-  }
-
-  /**
-   * Parses out all root classes of the given
-   * {@link org.apache.jena.ontology.OntModel}
-   * @param m the {@link org.apache.jena.ontology.OntModel} we wish to obtain
-   * all root classes for.
-   * @return an {@link java.util.Iterator} of {@link org.apache.jena.ontology.OntClass}
-   * elements representing all root classes.
-   */
-  @Override
-  public Iterator<OntClass> rootClasses(OntModel m) {
-    // Start from a fresh list on every call: appending to the previous one
-    // would report each root again on repeated parses. Replacing the list,
-    // rather than clearing it, leaves earlier iterators valid.
-    roots = new ArrayList<>();
-
-    Iterator<?> i = m.listClasses();
-    if (i.hasNext() && i.next() instanceof OntClass) {
-      //assume ontology has root classes
-      processSingle(m);
-    } else {
-      //check for presence of aggregate/collection ontologies such as sweetAll.owl
-      processCollection(m);
-    }
-
-    return roots.iterator();
-  }
-
-  /**
-   * Adds to the root list every named class of the model that is a direct
-   * subclass of Thing or declares no superclass at all.
-   */
-  private void processSingle(OntModel m) {
-    for (Iterator<?> i = m.listClasses(); i.hasNext(); ) {
-      OntClass c = (OntClass) i.next();
-      try {
-        // too confusing to list all the restrictions as root classes
-        if (c.isAnon()) {
-          continue;
-        }
-
-        if (c.hasSuperClass(m.getProfile().THING(), true) || c.getCardinality(m.getProfile().SUB_CLASS_OF()) == 0) {
-          // this class is directly descended from Thing
-          roots.add(c);
-        }
-      } catch (Exception e) {
-        Log.error("Error during extraction or root Classes from Ontology Model: ", e);
-      }
-    }
-  }
-
-  /**
-   * Collects the root classes of every sub-model of the given model.
-   */
-  private void processCollection(OntModel m) {
-    for (Iterator<?> i = m.listSubModels(true); i.hasNext(); ) {
-      OntModel ontModel = (OntModel) i.next();
-      processSingle(ontModel);
-    }
-  }
-
-  /**
-   * Turns a camel-case rdf:ID into a label by inserting a space wherever a
-   * lower-case letter or digit is directly followed by an upper-case
-   * letter, e.g. {@code seaSurface} becomes {@code sea Surface}.
-   *
-   * @param idString the rdf:ID to convert
-   * @return the ID with a space inserted at each such boundary
-   */
-  public String rdfidToLabel(String idString) {
-    return CAMEL_CASE_BOUNDARY.matcher(idString).replaceAll("$1 $2");
-  }
-
-}
http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/package-info.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/package-info.java b/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/package-info.java
deleted file mode 100644
index 3447426..0000000
--- a/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/package-info.java
+++ /dev/null
@@ -1,17 +0,0 @@
-/*
- * Licensed under the Apache License, Version 2.0 (the "License"); you
- * may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-/**
- * This package includes ontology processing classes.
- */
-package gov.nasa.jpl.mudrod.ontology.process;
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/package-info.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/package-info.java b/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/package-info.java
deleted file mode 100644
index 1e5d8bf..0000000
--- a/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/package-info.java
+++ /dev/null
@@ -1,18 +0,0 @@
-/*
- * Licensed under the Apache License, Version 2.0 (the "License"); you
- * may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-/**
- * This package includes the preprocessing, processing, and data structures used
- * by the recommendation module.
- */
-package gov.nasa.jpl.mudrod.recommendation;
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/pre/ImportMetadata.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/pre/ImportMetadata.java b/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/pre/ImportMetadata.java
deleted file mode 100644
index c174f31..0000000
--- a/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/pre/ImportMetadata.java
+++ /dev/null
@@ -1,111 +0,0 @@
-/*
- * Licensed under the Apache License, Version 2.0 (the "License"); you
- * may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package gov.nasa.jpl.mudrod.recommendation.pre;
-
-import com.google.gson.JsonElement;
-import com.google.gson.JsonParser;
-import gov.nasa.jpl.mudrod.discoveryengine.DiscoveryStepAbstract;
-import gov.nasa.jpl.mudrod.driver.ESDriver;
-import gov.nasa.jpl.mudrod.driver.SparkDriver;
-import gov.nasa.jpl.mudrod.main.MudrodConstants;
-import gov.nasa.jpl.mudrod.metadata.pre.ApiHarvester;
-import org.apache.commons.io.IOUtils;
-import org.elasticsearch.action.index.IndexRequest;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.*;
-import java.util.Properties;
-
-/**
- * ClassName: Import Metadata to elasticsearch
- */
-
-public class ImportMetadata extends DiscoveryStepAbstract {
-
- /**
- *
- */
- private static final long serialVersionUID = 1L;
- private static final Logger LOG = LoggerFactory.getLogger(ApiHarvester.class);
-
- public ImportMetadata(Properties props, ESDriver es, SparkDriver spark) {
- super(props, es, spark);
- }
-
- @Override
- public Object execute() {
- LOG.info("Starting Metadata Harvesting");
- startTime = System.currentTimeMillis();
- addMetadataMapping();
- importToES();
- endTime = System.currentTimeMillis();
- es.refreshIndex();
- LOG.info("Finished Metadata Harvesting time elapsed: {}s", (endTime - startTime) / 1000);
- return null;
- }
-
- /**
- * addMetadataMapping: Add mapping to index metadata in Elasticsearch. Please
- * invoke this method before import metadata to Elasticsearch.
- */
- public void addMetadataMapping() {
- String mappingJson = "{\r\n \"dynamic_templates\": " + "[\r\n " + "{\r\n \"strings\": " + "{\r\n \"match_mapping_type\": \"string\","
- + "\r\n \"mapping\": {\r\n \"type\": \"string\"," + "\r\n \"analyzer\": \"csv\"\r\n }" + "\r\n }\r\n }\r\n ]\r\n}";
-
- es.getClient().admin().indices().preparePutMapping(props.getProperty(MudrodConstants.ES_INDEX_NAME)).setType(props.getProperty("recom_metadataType")).setSource(mappingJson).execute().actionGet();
-
- }
-
- /**
- * importToES: Index metadata into elasticsearch from local file directory.
- * Please make sure metadata have been harvest from web service before
- * invoking this method.
- */
- private void importToES() {
- es.deleteType(props.getProperty("indexName"), props.getProperty("recom_metadataType"));
-
- es.createBulkProcessor();
- File directory = new File(props.getProperty(MudrodConstants.RAW_METADATA_PATH));
- File[] fList = directory.listFiles();
- for (File file : fList) {
- InputStream is;
- try {
- is = new FileInputStream(file);
- try {
- String jsonTxt = IOUtils.toString(is);
- JsonParser parser = new JsonParser();
- JsonElement item = parser.parse(jsonTxt);
- IndexRequest ir = new IndexRequest(props.getProperty(MudrodConstants.ES_INDEX_NAME), props.getProperty("recom_metadataType")).source(item.toString());
-
- // preprocessdata
-
- es.getBulkProcessor().add(ir);
- } catch (IOException e) {
- e.printStackTrace();
- }
- } catch (FileNotFoundException e) {
- e.printStackTrace();
- }
-
- }
-
- es.destroyBulkProcessor();
- }
-
- @Override
- public Object execute(Object o) {
- return null;
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/pre/MetadataTFIDFGenerator.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/pre/MetadataTFIDFGenerator.java b/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/pre/MetadataTFIDFGenerator.java
deleted file mode 100644
index 02c74f0..0000000
--- a/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/pre/MetadataTFIDFGenerator.java
+++ /dev/null
@@ -1,100 +0,0 @@
-/**
- * Project Name:mudrod-core
- * File Name:MetadataTFIDFGenerator.java
- * Package Name:gov.nasa.jpl.mudrod.recommendation.pre
- * Date:Aug 22, 2016 12:39:52 PM
- * Copyright (c) 2016, chenzhou1025@126.com All Rights Reserved.
- */
-
-package gov.nasa.jpl.mudrod.recommendation.pre;
-
-import gov.nasa.jpl.mudrod.discoveryengine.DiscoveryStepAbstract;
-import gov.nasa.jpl.mudrod.driver.ESDriver;
-import gov.nasa.jpl.mudrod.driver.SparkDriver;
-import gov.nasa.jpl.mudrod.recommendation.structure.MetadataOpt;
-import gov.nasa.jpl.mudrod.utils.LabeledRowMatrix;
-import gov.nasa.jpl.mudrod.utils.MatrixUtil;
-import org.apache.spark.api.java.JavaPairRDD;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Properties;
-
-/**
- * Discovery step that generates TF-IDF information for all metadata and
- * exports the resulting matrix as CSV.
- */
-public class MetadataTFIDFGenerator extends DiscoveryStepAbstract {
-
-  private static final long serialVersionUID = 1L;
-  private static final Logger LOG = LoggerFactory.getLogger(MetadataTFIDFGenerator.class);
-
-  /**
-   * Creates a new instance of MetadataTFIDFGenerator.
-   *
-   * @param props the Mudrod configuration
-   * @param es    the Elasticsearch driver
-   * @param spark the Spark driver
-   */
-  public MetadataTFIDFGenerator(Properties props, ESDriver es, SparkDriver spark) {
-    super(props, es, spark);
-  }
-
-  /**
-   * Runs the word-based TF-IDF generation and logs the elapsed time. Any
-   * exception raised by the generation is logged and not rethrown.
-   *
-   * @return always {@code null}
-   */
-  @Override
-  public Object execute() {
-
-    LOG.info("Starting Dataset TF_IDF Matrix Generator");
-    startTime = System.currentTimeMillis();
-    try {
-      generateWordBasedTFIDF();
-    } catch (Exception e) {
-      // Pass the exception as the trailing argument without a placeholder: SLF4J treats a
-      // trailing Throwable as the exception to log, so a "{}" would be left unfilled.
-      LOG.error("Error during Dataset TF_IDF Matrix Generation", e);
-    }
-    endTime = System.currentTimeMillis();
-
-    LOG.info("Dataset TF_IDF Matrix Generation complete, time elaspsed: {}s", (endTime - startTime) / 1000);
-
-    return null;
-  }
-
-  /**
-   * Not used by this step.
-   *
-   * @param o ignored
-   * @return always {@code null}
-   */
-  @Override
-  public Object execute(Object o) {
-    return null;
-  }
-
-  /**
-   * Builds a TF-IDF matrix from all metadata content, tokenized on single
-   * spaces, and exports it to the path configured under
-   * "metadata_word_tfidf_matrix".
-   *
-   * @return the labeled TF-IDF matrix
-   * @throws Exception if loading, tokenizing or matrix generation fails
-   */
-  public LabeledRowMatrix generateWordBasedTFIDF() throws Exception {
-    MetadataOpt opt = new MetadataOpt(props);
-    JavaPairRDD<String, String> metadataContents = opt.loadAll(es, spark);
-    return tokenizeAndExport(opt, metadataContents, " ", "metadata_word_tfidf_matrix");
-  }
-
-  /**
-   * Builds a TF-IDF matrix from the "DatasetParameter-Term",
-   * "DatasetParameter-Variable" and "Dataset-ExtractTerm" variables, tokenized
-   * on commas, and exports it to the path configured under
-   * "metadata_term_tfidf_matrix".
-   *
-   * @return the labeled TF-IDF matrix
-   * @throws Exception if loading, tokenizing or matrix generation fails
-   */
-  public LabeledRowMatrix generateTermBasedTFIDF() throws Exception {
-    MetadataOpt opt = new MetadataOpt(props);
-
-    List<String> variables = new ArrayList<>();
-    variables.add("DatasetParameter-Term");
-    variables.add("DatasetParameter-Variable");
-    variables.add("Dataset-ExtractTerm");
-
-    JavaPairRDD<String, String> metadataContents = opt.loadAll(es, spark, variables);
-    return tokenizeAndExport(opt, metadataContents, ",", "metadata_term_tfidf_matrix");
-  }
-
-  /**
-   * Tokenizes the given contents, computes the TF-IDF matrix and exports it as
-   * CSV to the path stored under the given configuration key.
-   */
-  private LabeledRowMatrix tokenizeAndExport(MetadataOpt opt, JavaPairRDD<String, String> contents, String separator, String outputProperty) throws Exception {
-    JavaPairRDD<String, List<String>> tokens = opt.tokenizeData(contents, separator);
-    LabeledRowMatrix tfidfMatrix = opt.tFIDFTokens(tokens, spark);
-    MatrixUtil.exportToCSV(tfidfMatrix.rowMatrix, tfidfMatrix.rowkeys, tfidfMatrix.colkeys, props.getProperty(outputProperty));
-    return tfidfMatrix;
-  }
-}
http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/pre/NormalizeVariables.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/pre/NormalizeVariables.java b/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/pre/NormalizeVariables.java
deleted file mode 100644
index f5eaa9c..0000000
--- a/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/pre/NormalizeVariables.java
+++ /dev/null
@@ -1,223 +0,0 @@
-package gov.nasa.jpl.mudrod.recommendation.pre;
-
-import gov.nasa.jpl.mudrod.discoveryengine.DiscoveryStepAbstract;
-import gov.nasa.jpl.mudrod.driver.ESDriver;
-import gov.nasa.jpl.mudrod.driver.SparkDriver;
-import org.elasticsearch.action.search.SearchResponse;
-import org.elasticsearch.action.update.UpdateRequest;
-import org.elasticsearch.common.unit.TimeValue;
-import org.elasticsearch.index.query.QueryBuilders;
-import org.elasticsearch.search.SearchHit;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.util.HashMap;
-import java.util.Map;
-import java.util.Properties;
-import java.util.regex.Pattern;
-
-/**
- * Discovery step that derives normalized numeric variables (spatial,
- * temporal, processing level and version) from every metadata document of the
- * configured type and writes them back to Elasticsearch as additional
- * "-Derivative-" fields.
- */
-public class NormalizeVariables extends DiscoveryStepAbstract {
-
-  private static final long serialVersionUID = 1L;
-  private static final Logger LOG = LoggerFactory.getLogger(NormalizeVariables.class);
-
-  // Compiled once instead of on every call.
-  private static final Pattern DIGIT_THEN_LETTER = Pattern.compile("[0-9]{1}[a-zA-Z]{1}");
-  private static final Pattern CONTAINS_LETTER = Pattern.compile(".*[a-zA-Z].*");
-
-  // index name
-  private final String indexName;
-  // type name of metadata in ES
-  private final String metadataType;
-
-  /**
-   * Creates a new instance of NormalizeVariables.
-   *
-   * @param props the Mudrod configuration
-   * @param es    an instantiated {@link ESDriver}
-   * @param spark an instantiated {@link SparkDriver}
-   */
-  public NormalizeVariables(Properties props, ESDriver es, SparkDriver spark) {
-    super(props, es, spark);
-    indexName = props.getProperty("indexName");
-    metadataType = props.getProperty("recom_metadataType");
-  }
-
-  /**
-   * Normalizes the variables of all metadata documents and logs the elapsed
-   * time.
-   *
-   * @return always {@code null}
-   */
-  @Override
-  public Object execute() {
-    LOG.info("*****************processing metadata variables starts******************");
-    startTime = System.currentTimeMillis();
-
-    normalizeMetadataVariables(es);
-
-    endTime = System.currentTimeMillis();
-    LOG.info("*****************processing metadata variables ends******************Took {}s", (endTime - startTime) / 1000);
-
-    return null;
-  }
-
-  /**
-   * Not used by this step.
-   *
-   * @param o ignored
-   * @return always {@code null}
-   */
-  @Override
-  public Object execute(Object o) {
-    return null;
-  }
-
-  /**
-   * Scrolls over every document of the metadata type and queues one update
-   * request per document carrying the derived values.
-   *
-   * @param es the Elasticsearch driver used for searching and bulk updating
-   */
-  public void normalizeMetadataVariables(ESDriver es) {
-
-    es.createBulkProcessor();
-    try {
-      SearchResponse scrollResp = es.getClient().prepareSearch(indexName).setTypes(metadataType).setScroll(new TimeValue(60000)).setQuery(QueryBuilders.matchAllQuery()).setSize(100).execute()
-          .actionGet();
-      do {
-        for (SearchHit hit : scrollResp.getHits().getHits()) {
-          Map<String, Object> metadata = hit.getSource();
-          Map<String, Object> updatedValues = new HashMap<>();
-
-          this.normalizeSpatialVariables(metadata, updatedValues);
-          this.normalizeTemporalVariables(metadata, updatedValues);
-          this.normalizeOtherVariables(metadata, updatedValues);
-
-          UpdateRequest ur = es.generateUpdateRequest(indexName, metadataType, hit.getId(), updatedValues);
-          es.getBulkProcessor().add(ur);
-        }
-
-        scrollResp = es.getClient().prepareSearchScroll(scrollResp.getScrollId()).setScroll(new TimeValue(600000)).execute().actionGet();
-        // an empty page marks the end of the scroll
-      } while (scrollResp.getHits().getHits().length != 0);
-    } finally {
-      // release the bulk processor even if a document could not be processed
-      es.destroyBulkProcessor();
-    }
-  }
-
-  /**
-   * Derives "Dataset-Derivative-VersionNum".
-   */
-  private void normalizeOtherVariables(Map<String, Object> metadata, Map<String, Object> updatedValues) {
-    // NOTE(review): the version number is derived from "Dataset-ShortName", not from a
-    // version field; kept as in the original, but confirm this is the intended input.
-    String shortname = (String) metadata.get("Dataset-ShortName");
-    double versionNum = getVersionNum(shortname);
-    updatedValues.put("Dataset-Derivative-VersionNum", versionNum);
-  }
-
-  /**
-   * Maps a version string to a number: {@code null}, strings containing
-   * letters and unparsable strings give 0; "Operational/Near-Real-Time" gives
-   * 2; a digit followed by one letter gives that digit; any other numeric
-   * string gives its value, with values of 5 or more replaced by 20.
-   */
-  private Double getVersionNum(String version) {
-    if (version == null) {
-      return 0.0;
-    }
-    if ("Operational/Near-Real-Time".equals(version)) {
-      return 2.0;
-    }
-    if (DIGIT_THEN_LETTER.matcher(version).matches()) {
-      return Double.parseDouble(version.substring(0, 1));
-    }
-    if (CONTAINS_LETTER.matcher(version).find()) {
-      return 0.0;
-    }
-    try {
-      double parsed = Double.parseDouble(version);
-      return parsed >= 5 ? 20.0 : parsed;
-    } catch (NumberFormatException e) {
-      // e.g. an empty string or "1-2": fall back to the "unknown" value instead of aborting the run
-      LOG.warn("Unable to parse version number from '{}', using 0", version);
-      return 0.0;
-    }
-  }
-
-  /**
-   * Derives the spatial resolution, the transformed bounding box, the
-   * coverage area and the processing level.
-   */
-  private void normalizeSpatialVariables(Map<String, Object> metadata, Map<String, Object> updatedValues) {
-
-    // get spatial resolution; values are read as Number so that an integral value
-    // (which would not be a Double instance) does not cause a ClassCastException
-    Double spatialR;
-    Object satelliteR = metadata.get("Dataset-SatelliteSpatialResolution");
-    if (satelliteR != null) {
-      spatialR = ((Number) satelliteR).doubleValue();
-    } else {
-      Object gridR = metadata.get("Dataset-GridSpatialResolution");
-      if (gridR != null) {
-        // presumably converts degrees to kilometres (about 111 km per degree) - TODO confirm
-        spatialR = 111 * ((Number) gridR).doubleValue();
-      } else {
-        // default used when neither resolution is present
-        spatialR = 25.0;
-      }
-    }
-    updatedValues.put("Dataset-Derivative-SpatialResolution", spatialR);
-
-    // Transform Longitude and calculate coverage area
-    double top = parseDouble((String) metadata.get("DatasetCoverage-NorthLat"));
-    double bottom = parseDouble((String) metadata.get("DatasetCoverage-SouthLat"));
-    double left = parseDouble((String) metadata.get("DatasetCoverage-WestLon"));
-    double right = parseDouble((String) metadata.get("DatasetCoverage-EastLon"));
-
-    // shift longitudes above 180 down by 360
-    if (left > 180) {
-      left = left - 360;
-    }
-
-    if (right > 180) {
-      right = right - 360;
-    }
-
-    // identical west and east longitudes are replaced by the full -180..180 span
-    if (left == right) {
-      left = -180;
-      right = 180;
-    }
-
-    double area = (top - bottom) * (right - left);
-
-    updatedValues.put("DatasetCoverage-Derivative-EastLon", right);
-    updatedValues.put("DatasetCoverage-Derivative-WestLon", left);
-    updatedValues.put("DatasetCoverage-Derivative-NorthLat", top);
-    updatedValues.put("DatasetCoverage-Derivative-SouthLat", bottom);
-    updatedValues.put("DatasetCoverage-Derivative-Area", area);
-
-    // get processing level
-    String processingLevel = (String) metadata.get("Dataset-ProcessingLevel");
-    double dProLevel = this.getProLevelNum(processingLevel);
-    updatedValues.put("Dataset-Derivative-ProcessingLevel", dProLevel);
-  }
-
-  /**
-   * Derives "Dataset-Derivative-TemporalResolution" in hours from
-   * "Dataset-TemporalResolution", falling back to "Dataset-TemporalRepeat"
-   * when the former is missing or empty.
-   */
-  private void normalizeTemporalVariables(Map<String, Object> metadata, Map<String, Object> updatedValues) {
-
-    String trStr = (String) metadata.get("Dataset-TemporalResolution");
-    if (trStr == null || trStr.isEmpty()) {
-      trStr = (String) metadata.get("Dataset-TemporalRepeat");
-    }
-
-    updatedValues.put("Dataset-Derivative-TemporalResolution", convertTimeUnit(trStr));
-  }
-
-  /**
-   * Converts a string such as "3 Day" into hours. The number is the text in
-   * front of the first space; the unit is recognized by the substrings "Hour",
-   * "Day", "Week", "Month" (taken as 30 days) and "Year" (taken as 365 days).
-   * A {@code null} string, an unknown unit or an unparsable number gives 0.
-   */
-  private Double convertTimeUnit(String str) {
-    if (str == null) {
-      return 0.0;
-    }
-
-    double hoursPerUnit;
-    if (str.contains("Hour")) {
-      hoursPerUnit = 1;
-    } else if (str.contains("Day")) {
-      hoursPerUnit = 24;
-    } else if (str.contains("Week")) {
-      hoursPerUnit = 24 * 7;
-    } else if (str.contains("Month")) {
-      // 24 hours * 30 days (the original additionally multiplied by 7)
-      hoursPerUnit = 24 * 30;
-    } else if (str.contains("Year")) {
-      // 24 hours * 365 days (the original additionally multiplied by 7 * 30)
-      hoursPerUnit = 24 * 365;
-    } else {
-      return 0.0;
-    }
-
-    try {
-      return Double.parseDouble(str.split(" ")[0]) * hoursPerUnit;
-    } catch (NumberFormatException e) {
-      LOG.warn("Unable to parse temporal resolution from '{}', using 0", str);
-      return 0.0;
-    }
-  }
-
-  /**
-   * Maps a processing level string to a number: a digit followed by one
-   * letter gives that digit; {@code null}, any other string containing a
-   * letter and unparsable strings give 1; any other numeric string gives its
-   * value.
-   *
-   * @param pro the processing level, may be {@code null}
-   * @return the numeric processing level
-   */
-  public Double getProLevelNum(String pro) {
-    if (pro == null) {
-      return 1.0;
-    }
-    if (DIGIT_THEN_LETTER.matcher(pro).matches()) {
-      return Double.parseDouble(pro.substring(0, 1));
-    }
-    if (CONTAINS_LETTER.matcher(pro).find()) {
-      return 1.0;
-    }
-    try {
-      return Double.parseDouble(pro);
-    } catch (NumberFormatException e) {
-      // e.g. an empty string: use the same default as for null and non-numeric levels
-      LOG.warn("Unable to parse processing level from '{}', using 1", pro);
-      return 1.0;
-    }
-  }
-
-  /**
-   * Parses a number, returning 0 for a {@code null} or empty string and -1
-   * for a string that is not a valid number.
-   */
-  private double parseDouble(String strNumber) {
-    if (strNumber == null || strNumber.isEmpty()) {
-      return 0;
-    }
-    try {
-      return Double.parseDouble(strNumber);
-    } catch (NumberFormatException e) {
-      return -1;
-    }
-  }
-}
http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/pre/SessionCooccurence.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/pre/SessionCooccurence.java b/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/pre/SessionCooccurence.java
deleted file mode 100644
index 2aecce3..0000000
--- a/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/pre/SessionCooccurence.java
+++ /dev/null
@@ -1,152 +0,0 @@
-/**
- * Project Name:mudrod-core
- * File Name:SessionCooccurence.java
- * Package Name:gov.nasa.jpl.mudrod.recommendation.pre
- * Date:Aug 19, 2016 3:06:33 PM
- * Copyright (c) 2016, chenzhou1025@126.com All Rights Reserved.
- */
-
-package gov.nasa.jpl.mudrod.recommendation.pre;
-
-import gov.nasa.jpl.mudrod.discoveryengine.DiscoveryStepAbstract;
-import gov.nasa.jpl.mudrod.driver.ESDriver;
-import gov.nasa.jpl.mudrod.driver.SparkDriver;
-import gov.nasa.jpl.mudrod.main.MudrodConstants;
-import gov.nasa.jpl.mudrod.utils.LabeledRowMatrix;
-import gov.nasa.jpl.mudrod.utils.MatrixUtil;
-import gov.nasa.jpl.mudrod.weblog.structure.SessionExtractor;
-import org.apache.spark.api.java.JavaPairRDD;
-import org.apache.spark.api.java.function.PairFunction;
-import org.elasticsearch.action.search.SearchResponse;
-import org.elasticsearch.common.unit.TimeValue;
-import org.elasticsearch.index.query.QueryBuilders;
-import org.elasticsearch.search.SearchHit;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import scala.Tuple2;
-
-import java.util.*;
-
-/**
- * ClassName: SessionCooccurence Function: Generate the metadata session
- * co-occurrence matrix from web logs and export it as CSV.
- * NOTE(review): the original comment states that each row corresponds to a
- * metadata record and each column to a session; the orientation is decided by
- * MatrixUtil.createWordDocMatrix and should be confirmed there.
- */
-public class SessionCooccurence extends DiscoveryStepAbstract {
-
-  private static final long serialVersionUID = 1L;
-  private static final Logger LOG = LoggerFactory.getLogger(SessionCooccurence.class);
-
-  /**
-   * Creates a new instance of SessionCooccurence.
-   *
-   * @param props
-   *          the Mudrod configuration
-   * @param es
-   *          the Elasticsearch driver
-   * @param spark
-   *          the spark driver
-   */
-  public SessionCooccurence(Properties props, ESDriver es, SparkDriver spark) {
-    super(props, es, spark);
-  }
-
-  /**
-   * Builds the session/dataset RDD, drops datasets that are not present in
-   * the metadata type, creates the matrix and exports it to the path
-   * configured under "session_metadata_Matrix".
-   *
-   * @return always {@code null}
-   */
-  @Override
-  public Object execute() {
-
-    LOG.info("Starting dataset session-based similarity generation...");
-
-    startTime = System.currentTimeMillis();
-
-    // get all metadata session co-occurrence data
-    SessionExtractor extractor = new SessionExtractor();
-    JavaPairRDD<String, List<String>> sessionDatasetRDD = extractor.bulidSessionDatasetRDD(props, es, spark);
-
-    // remove retired datasets
-    JavaPairRDD<String, List<String>> sessionFiltedDatasetsRDD = removeRetiredDataset(es, sessionDatasetRDD);
-    LabeledRowMatrix datasetSessionMatrix = MatrixUtil.createWordDocMatrix(sessionFiltedDatasetsRDD);
-
-    // export
-    MatrixUtil.exportToCSV(datasetSessionMatrix.rowMatrix, datasetSessionMatrix.rowkeys, datasetSessionMatrix.colkeys, props.getProperty("session_metadata_Matrix"));
-
-    endTime = System.currentTimeMillis();
-
-    LOG.info("Completed dataset session-based similarity generation. Time elapsed: {}s", (endTime - startTime) / 1000);
-
-    return null;
-  }
-
-  /**
-   * Not used by this step.
-   *
-   * @param o ignored
-   * @return always {@code null}
-   */
-  @Override
-  public Object execute(Object o) {
-    return null;
-  }
-
-  /**
-   * Filters out-of-date metadata: every dataset name of a session that is a
-   * key of the on-service name map is replaced by the mapped original short
-   * name, all other names are dropped.
-   *
-   * @param es
-   *          the Elasticsearch driver
-   * @param userDatasetsRDD
-   *          datasets extracted from sessions
-   * @return filtered session datasets
-   */
-  public JavaPairRDD<String, List<String>> removeRetiredDataset(ESDriver es, JavaPairRDD<String, List<String>> userDatasetsRDD) {
-
-    // NOTE(review): the map keys are lower-cased short names while the lookup below uses the
-    // session's dataset names unchanged - presumably those are already lower case; verify
-    // against SessionExtractor.
-    final Map<String, String> nameMap = this.getOnServiceMetadata(es);
-
-    return userDatasetsRDD.mapToPair(new PairFunction<Tuple2<String, List<String>>, String, List<String>>() {
-
-      private static final long serialVersionUID = 1L;
-
-      @Override
-      public Tuple2<String, List<String>> call(Tuple2<String, List<String>> session) throws Exception {
-        List<String> onServiceDatasets = new ArrayList<>();
-        for (String name : session._2) {
-          // values in nameMap are never null, so a single get() replaces containsKey() + get()
-          String originalName = nameMap.get(name);
-          if (originalName != null) {
-            onServiceDatasets.add(originalName);
-          }
-        }
-        return new Tuple2<>(session._1, onServiceDatasets);
-      }
-    });
-
-  }
-
-  /**
-   * getOnServiceMetadata: Get on-service metadata names; the key is the lower
-   * case of the short name and the value is the original short name.
-   * Documents without a "Dataset-ShortName" are logged and skipped.
-   *
-   * @param es
-   *          the Elasticsearch driver
-   * @return a map from lower case metadata name to original metadata name
-   */
-  private Map<String, String> getOnServiceMetadata(ESDriver es) {
-
-    String indexName = props.getProperty(MudrodConstants.ES_INDEX_NAME);
-    String metadataType = props.getProperty("recom_metadataType");
-
-    Map<String, String> shortnameMap = new HashMap<>();
-    SearchResponse scrollResp = es.getClient().prepareSearch(indexName).setTypes(metadataType).setScroll(new TimeValue(60000)).setQuery(QueryBuilders.matchAllQuery()).setSize(100).execute()
-        .actionGet();
-    do {
-      for (SearchHit hit : scrollResp.getHits().getHits()) {
-        String shortName = (String) hit.getSource().get("Dataset-ShortName");
-        if (shortName == null) {
-          // a document without a short name would otherwise abort the scan with a NullPointerException
-          LOG.warn("Metadata document {} has no Dataset-ShortName, skipping it", hit.getId());
-          continue;
-        }
-        shortnameMap.put(shortName.toLowerCase(), shortName);
-      }
-
-      scrollResp = es.getClient().prepareSearchScroll(scrollResp.getScrollId()).setScroll(new TimeValue(600000)).execute().actionGet();
-      // an empty page marks the end of the scroll
-    } while (scrollResp.getHits().getHits().length != 0);
-
-    return shortnameMap;
-  }
-
-}