You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@sdap.apache.org by le...@apache.org on 2017/12/19 14:13:14 UTC
[13/17] incubator-sdap-mudrod git commit: SDAP-7 Change all package
namespaces to org.apache.sdap
http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/ranking/DataGenerator.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/ranking/DataGenerator.java b/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/ranking/DataGenerator.java
deleted file mode 100644
index 598ad61..0000000
--- a/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/ranking/DataGenerator.java
+++ /dev/null
@@ -1,313 +0,0 @@
-/*
- * Licensed under the Apache License, Version 2.0 (the "License"); you
- * may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package gov.nasa.jpl.mudrod.ssearch.ranking;
-
-import au.com.bytecode.opencsv.CSVReader;
-import au.com.bytecode.opencsv.CSVWriter;
-
-import java.io.*;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-
-/**
- * Builds pairwise training data for the RankSVM algorithm. Instead of working in
- * the space of query-document vectors (x1, x2, x3, ...), every pair of documents
- * whose evaluations differ is represented by the difference between their feature
- * vectors, labelled 1 or -1 depending on which of the two was rated higher.
- *
- * <p>All state lives in static fields: the class is not thread-safe, and the rows
- * collected by {@link #calculateVec(String[][])} accumulate across instances and
- * across calls to {@link #process()}.
- */
-public class DataGenerator {
-  private static String mySourceDir;
-  private static String myResultDir;
-  private static boolean isMultFiles;
-
-  private static String[] myHeader;
-  private static List<List<String>> myMasterList = new ArrayList<List<String>>();
-
-  // Maps every evaluation label to a rank; a higher rank means a better evaluation
-  public static final HashMap<String, Integer> map1 = new HashMap<String, Integer>();
-
-  static {
-    map1.put("Excellent", 7);
-    map1.put("Very good", 6);
-    map1.put("Good", 5);
-    map1.put("OK", 4);
-    map1.put("Bad", 3);
-    map1.put("Very bad", 2);
-    map1.put("Terrible", 1);
-  }
-
-  /**
-   * Records where to read the input from and where to write the result to.
-   * Note that the values are stored in static fields, so the most recently
-   * constructed instance determines what every static method operates on.
-   *
-   * @param sourceDir a single CSV file, or a directory of CSV files when
-   *                  {@code multFiles} is true
-   * @param resultDir path of the CSV file to write (it is opened directly as a file)
-   * @param multFiles true if {@code sourceDir} is a directory whose files all need
-   *                  to be processed, false if it is a single file
-   */
-  public DataGenerator(String sourceDir, String resultDir, boolean multFiles) {
-    mySourceDir = sourceDir;
-    myResultDir = resultDir;
-    isMultFiles = multFiles;
-  }
-
-  /**
-   * Parses the configured input and writes everything collected so far to the
-   * configured output file.
-   */
-  public void process() {
-    parseFile();
-    writeCSVfile(myMasterList);
-  }
-
-  /**
-   * Reads the configured CSV file (or every regular file of the configured
-   * directory), remembers the header row and feeds the rows to
-   * {@link #calculateVec(String[][])}. I/O failures are printed to standard error
-   * and end the parsing; rows collected before the failure are kept.
-   */
-  public static void parseFile() {
-    try {
-      if (isMultFiles) {
-        File[] directoryListing = new File(mySourceDir).listFiles();
-        if (directoryListing == null) {
-          return; // not a directory, or it could not be listed
-        }
-
-        String[][] lastParsed = null;
-        for (File child : directoryListing) {
-          if (!child.isFile()) {
-            continue; // a sub-directory cannot be opened as a CSV file
-          }
-          lastParsed = readAll(child);
-          calculateVec(lastParsed);
-        }
-        if (lastParsed != null) {
-          storeHead(lastParsed); // the header of the last file is used for the output
-        }
-      } else {
-        String[][] dataArr = readAll(new File(mySourceDir));
-        storeHead(dataArr);
-        calculateVec(dataArr);
-      }
-    } catch (IOException e) { // also covers FileNotFoundException
-      e.printStackTrace();
-    }
-  }
-
-  /**
-   * Loads a whole CSV file into a 2D array, closing the reader even on failure.
-   */
-  private static String[][] readAll(File csvFile) throws IOException {
-    try (CSVReader csvReader = new CSVReader(new FileReader(csvFile))) {
-      List<String[]> rows = csvReader.readAll();
-      return rows.toArray(new String[rows.size()][]);
-    }
-  }
-
-  /**
-   * Builds the difference vector of every pair of data rows whose evaluations
-   * differ, appends 1 or -1 as the new evaluation, balances the two classes with
-   * {@link #equalizeList(List)} and adds the result to the master list.
-   *
-   * @param arr the parsed contents of the original CSV file; row 0 is the header
-   *            and the last column holds the evaluation label
-   * @throws IllegalArgumentException if an evaluation label is not a key of {@link #map1}
-   * @throws NumberFormatException    if a feature cell is not a number
-   */
-  public static void calculateVec(String[][] arr) {
-    if (arr.length < 2) {
-      return; // header only (or nothing at all): no pair can be formed
-    }
-
-    int labelCol = arr[0].length - 1; // the evaluation label is the last column
-    List<List<String>> pairs = new ArrayList<List<String>>();
-
-    for (int first = 1; first < arr.length; first++) { // row 0 is the header
-      for (int second = first + 1; second < arr.length; second++) {
-        int cmp = compareEvaluation(arr[first][labelCol], arr[second][labelCol]);
-        if (cmp != 1 && cmp != -1) {
-          continue; // equal evaluations carry no ranking information
-        }
-
-        List<String> diff = new ArrayList<String>(labelCol + 1);
-        for (int col = 0; col < labelCol; col++) {
-          double x1 = Double.parseDouble(arr[first][col]);
-          double x2 = Double.parseDouble(arr[second][col]);
-          diff.add(Double.toString(x1 - x2));
-        }
-        diff.add(Integer.toString(cmp));
-        pairs.add(diff);
-      }
-    }
-
-    myMasterList.addAll(equalizeList(pairs));
-  }
-
-  /**
-   * Compares two evaluation labels by their rank in {@link #map1}.
-   *
-   * @param eval1 evaluation from first vector
-   * @param eval2 evaluation from second vector
-   * @return 1 if the first evaluation is greater than the second, -1 if it is less,
-   * and 10 if the two are equal (meaning the pair cannot be used)
-   * @throws IllegalArgumentException if either label is not a key of {@link #map1}
-   */
-  public static int compareEvaluation(String eval1, String eval2) {
-    int rank1 = rankOf(eval1);
-    int rank2 = rankOf(eval2);
-
-    if (rank1 > rank2) {
-      return 1;
-    } else if (rank1 < rank2) {
-      return -1;
-    }
-    return 10;
-  }
-
-  /**
-   * Looks up the rank of a label, failing with a descriptive message instead of an
-   * unboxing NullPointerException when the label is unknown.
-   */
-  private static int rankOf(String evaluation) {
-    Integer rank = map1.get(evaluation);
-    if (rank == null) {
-      throw new IllegalArgumentException("Unknown evaluation label: " + evaluation);
-    }
-    return rank;
-  }
-
-  /**
-   * Balances the number of rows labelled 1 and -1 by negating whole rows (features
-   * and label) of the larger class, in order of appearance, until the counts differ
-   * by at most one. The rows are modified in place. A negated label is stored in
-   * its double form ("-1.0" / "1.0"), so a list that has already been equalized
-   * cannot be passed in again (the integer parse of the label would fail).
-   *
-   * @param rawList rows as produced by {@link #calculateVec(String[][])}; the last
-   *                element of each row must be "1" or "-1"
-   * @return the same list instance, now with balanced evaluation values
-   */
-  public static List<List<String>> equalizeList(List<List<String>> rawList) {
-    if (rawList.isEmpty()) {
-      return rawList;
-    }
-
-    int width = rawList.get(0).size(); // all rows are as wide as the first one
-    List<Integer> pos1List = new ArrayList<Integer>(); // indexes of rows labelled 1
-    List<Integer> neg1List = new ArrayList<Integer>(); // indexes of rows labelled -1
-
-    for (int i = 0; i < rawList.size(); i++) {
-      int evalNum = Integer.parseInt(rawList.get(i).get(width - 1));
-      if (evalNum == 1) {
-        pos1List.add(i);
-      } else if (evalNum == -1) {
-        neg1List.add(i);
-      }
-    }
-
-    int totPosCount = pos1List.size();
-    int totNegCount = neg1List.size();
-
-    if ((totPosCount - totNegCount) >= 1) {
-      // More 1's than -1's: turn leading +1 rows into -1 rows
-      for (int next = 0; (totPosCount - totNegCount) >= 1; next++) {
-        negateRow(rawList.get(pos1List.get(next)), width);
-        totPosCount--;
-        totNegCount++;
-      }
-    } else if ((totNegCount - totPosCount) > 1) {
-      // More -1's than 1's: turn leading -1 rows into +1 rows
-      for (int next = 0; (totNegCount - totPosCount) > 1; next++) {
-        negateRow(rawList.get(neg1List.get(next)), width);
-        totNegCount--;
-        totPosCount++;
-      }
-    }
-    // Otherwise the counts are already within plus or minus one of each other
-
-    return rawList;
-  }
-
-  /**
-   * Multiplies the first {@code width} cells of a row by -1, in place.
-   */
-  private static void negateRow(List<String> row, int width) {
-    for (int col = 0; col < width; col++) {
-      double negated = Double.parseDouble(row.get(col)) * -1;
-      row.set(col, Double.toString(negated));
-    }
-  }
-
-  /**
-   * Remembers the header row of a parsed file so it can be written to the output
-   * file later. An input without any row leaves the stored header unchanged.
-   *
-   * @param arr 2D array containing the parsed information from input file
-   */
-  public static void storeHead(String[][] arr) {
-    if (arr == null || arr.length == 0) {
-      return; // nothing was parsed, so there is no header row
-    }
-    myHeader = arr[0].clone();
-  }
-
-  /**
-   * Writes the header followed by the given rows to the configured output file.
-   * An output file that already exists is left untouched: previously it was
-   * truncated by the writer and then nothing was written, which destroyed it.
-   *
-   * @param list finalized vector data to write to user specified output file
-   */
-  public static void writeCSVfile(List<List<String>> list) {
-    String outputFile = myResultDir;
-    if (new File(outputFile).exists()) {
-      System.err.println("Output file already exists, nothing written: " + outputFile);
-      return;
-    }
-
-    try (CSVWriter csvOutput = new CSVWriter(new FileWriter(outputFile), ',')) {
-      csvOutput.writeNext(myHeader); // header first, then the data rows
-      for (List<String> row : list) {
-        csvOutput.writeNext(row.toArray(new String[row.size()]));
-      }
-    } catch (IOException e) {
-      e.printStackTrace();
-    }
-  }
-
-}
http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/ranking/Evaluator.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/ranking/Evaluator.java b/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/ranking/Evaluator.java
deleted file mode 100644
index 8edb6ad..0000000
--- a/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/ranking/Evaluator.java
+++ /dev/null
@@ -1,145 +0,0 @@
-/*
- * Licensed under the Apache License, Version 2.0 (the "License"); you
- * may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package gov.nasa.jpl.mudrod.ssearch.ranking;
-
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.List;
-import java.util.stream.Collectors;
-import java.util.stream.IntStream;
-
-/**
- * Computes ranking-quality metrics (NDCG and precision at K) for a ranked list of
- * integer relevance grades.
- */
-public class Evaluator {
-  /**
-   * Calculates the normalized DCG: the DCG of the list divided by the DCG of the
-   * same grades sorted in descending order.
-   *
-   * @param list a list of integer with each integer element indicating
-   *             the performance at its position
-   * @param K    the number of elements needed to be included in the calculation
-   * @return NDCG score, or 0 when the ideal DCG is not positive (empty list,
-   * non-positive K, or no positive grades)
-   */
-  public double getNDCG(int[] list, int K) {
-    double idcg = this.getIDCG(list, K);
-    if (idcg > 0.0) {
-      return this.getDCG(list, K) / idcg;
-    }
-    return 0.0;
-  }
-
-  /**
-   * Calculates the fraction of relevant elements among the first K positions.
-   *
-   * @param list a list of integer with each integer element indicating
-   *             the performance at its position
-   * @param K    the number of elements needed to be included in the calculation;
-   *             values larger than the list are capped at the list length
-   * @return precision at K, or 0 for an empty list or a non-positive K
-   */
-  public double getPrecision(int[] list, int K) {
-    int cutoff = Math.min(K, list.length);
-    if (cutoff <= 0) {
-      return 0;
-    }
-    return (double) this.getRelevantDocNum(list, cutoff) / (double) cutoff;
-  }
-
-  /**
-   * Counts the elements among the first K positions whose grade is above 3.
-   *
-   * @param list a list of integer with each integer element indicating
-   *             the performance at its position
-   * @param K    the number of elements needed to be included in the calculation
-   * @return the number of relevant element
-   */
-  private int getRelevantDocNum(int[] list, int K) {
-    int cutoff = Math.min(K, list.length);
-    int relevant = 0;
-    for (int i = 0; i < cutoff; i++) {
-      // NOTE(review): this threshold was commented as '3 refers to "OK"', but
-      // DataGenerator.map1 ranks "OK" as 4 and "Bad" as 3 - confirm the scale used
-      if (list[i] > 3) {
-        relevant++;
-      }
-    }
-    return relevant;
-  }
-
-  /**
-   * Calculates the DCG of the first K positions: the first grade is taken as is
-   * and the grade at every later (1-based) position p is divided by log2(p).
-   *
-   * @param list a list of integer with each integer element indicating
-   *             the performance at its position
-   * @param K    the number of elements needed to be included in the calculation
-   * @return DCG score, or 0 for an empty list or a non-positive K
-   */
-  private double getDCG(int[] list, int K) {
-    int cutoff = Math.min(K, list.length);
-    if (cutoff <= 0) {
-      return 0.0; // also rejects a negative K, which used to yield list[0]
-    }
-
-    double dcg = list[0]; // log2(1) is 0, so position 1 cannot be discounted
-    for (int i = 1; i < cutoff; i++) {
-      int pos = i + 1;
-      dcg += list[i] / (Math.log(pos) / Math.log(2));
-    }
-    return dcg;
-  }
-
-  /**
-   * Calculates the ideal DCG, i.e. the DCG of the grades in descending order.
-   * The given array is not modified.
-   *
-   * @param list a list of integer with each integer element indicating
-   *             the performance at its position
-   * @param K    the number of elements needed to be included in the calculation
-   * @return IDCG score
-   */
-  private double getIDCG(int[] list, int K) {
-    int[] ideal = IntStream.of(list).boxed().sorted(Comparator.reverseOrder()).mapToInt(Integer::intValue).toArray();
-    return this.getDCG(ideal, K);
-  }
-
-}
http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/ranking/Learner.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/ranking/Learner.java b/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/ranking/Learner.java
deleted file mode 100644
index d1c5199..0000000
--- a/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/ranking/Learner.java
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Licensed under the Apache License, Version 2.0 (the "License"); you
- * may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package gov.nasa.jpl.mudrod.ssearch.ranking;
-
-import gov.nasa.jpl.mudrod.driver.SparkDriver;
-import org.apache.spark.SparkContext;
-import org.apache.spark.mllib.classification.SVMModel;
-import org.apache.spark.mllib.regression.LabeledPoint;
-
-import java.io.Serializable;
-
-/**
- * Loads a trained classifier into memory and applies it to single instances.
- */
-public class Learner implements Serializable {
-  private static final long serialVersionUID = 1L;
-  // The only classifier name the constructor recognises
-  private static final String SPARKSVM = "SparkSVM";
-  // Stays null when the constructor was given any other classifier name
-  SVMModel model = null;
-  transient SparkContext sc = null;
-
-  /**
-   * Loads the Spark SVM model stored at the given path. For any classifier name
-   * other than "SparkSVM" (including null) nothing is loaded.
-   *
-   * @param classifierName classifier type
-   * @param skd            an instance of spark driver
-   * @param svmSgdModel    path to a trained model
-   */
-  public Learner(String classifierName, SparkDriver skd, String svmSgdModel) {
-    // Constant-first comparison so that a null name is treated as "unsupported"
-    if (SPARKSVM.equals(classifierName)) {
-      sc = skd.sc.sc();
-      sc.addFile(svmSgdModel, true);
-      model = SVMModel.load(sc, svmSgdModel);
-    }
-  }
-
-  /**
-   * Classifies an instance with the loaded model, using only its features.
-   *
-   * @param p the instance that needs to be classified
-   * @return the class id
-   * @throws IllegalStateException if no model was loaded by the constructor
-   */
-  public double classify(LabeledPoint p) {
-    if (model == null) {
-      throw new IllegalStateException("No classifier loaded; only \"" + SPARKSVM + "\" is supported");
-    }
-    return model.predict(p.features());
-  }
-
-}
http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/ranking/SparkFormatter.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/ranking/SparkFormatter.java b/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/ranking/SparkFormatter.java
deleted file mode 100644
index ba46d41..0000000
--- a/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/ranking/SparkFormatter.java
+++ /dev/null
@@ -1,55 +0,0 @@
-package gov.nasa.jpl.mudrod.ssearch.ranking;
-
-import java.io.*;
-import java.text.DecimalFormat;
-
-/**
- * Converts the pairwise CSV training data into the LibSVM text format
- * ({@code label index:value index:value ...}).
- */
-public class SparkFormatter {
-  // At most three decimals; the symbols are pinned to Locale.ROOT so that the
-  // decimal separator is '.' whatever the default locale of the JVM is
-  DecimalFormat NDForm = new DecimalFormat("#.###", new java.text.DecimalFormatSymbols(java.util.Locale.ROOT));
-
-  public SparkFormatter() {
-  }
-
-  /**
-   * Rewrites a CSV file as a LibSVM file. The first CSV line is skipped as header.
-   * In every other line the last column is the label (-1 is written as 0, 1 as 1)
-   * and the remaining columns become 1-based {@code index:value} features. Double
-   * quotes around cells are removed. Blank lines and rows whose label is neither
-   * 1 nor -1 are skipped, because they cannot be expressed as a labelled line.
-   * An existing output file is overwritten. I/O failures are printed to standard
-   * error.
-   *
-   * @param inputCSVFileName  path of the CSV file to read
-   * @param outputTXTFileName path of the text file to write
-   * @throws NumberFormatException if a cell of a data row is not a number
-   */
-  public void toSparkSVMformat(String inputCSVFileName, String outputTXTFileName) {
-    // The reader is opened first so that a missing input does not clobber the output
-    try (BufferedReader br = new BufferedReader(new FileReader(inputCSVFileName));
-        BufferedWriter bw = new BufferedWriter(new FileWriter(outputTXTFileName))) {
-      br.readLine(); // skip the header line
-      for (String line = br.readLine(); line != null; line = br.readLine()) {
-        if (line.trim().isEmpty()) {
-          continue;
-        }
-        String[] list = line.split(",");
-        if (list.length == 0) {
-          continue; // the line consisted of separators only
-        }
-
-        double label = Double.parseDouble(list[list.length - 1].replace("\"", ""));
-        StringBuilder output = new StringBuilder();
-        if (label == -1.0) {
-          output.append("0 ");
-        } else if (label == 1.0) {
-          output.append("1 ");
-        } else {
-          continue; // a line without a label would not be valid LibSVM input
-        }
-
-        for (int i = 0; i < list.length - 1; i++) {
-          double value = Double.parseDouble(list[i].replace("\"", ""));
-          output.append(i + 1).append(':').append(NDForm.format(value)).append(' ');
-        }
-        bw.write(output.append('\n').toString());
-      }
-    } catch (IOException e) {
-      e.printStackTrace();
-    }
-  }
-
-  /**
-   * Converts the ranking results at a fixed local path.
-   *
-   * @param args not used
-   */
-  public static void main(String[] args) {
-    SparkFormatter sf = new SparkFormatter();
-    sf.toSparkSVMformat("C:/mudrodCoreTestData/rankingResults/inputDataForSVM.csv", "C:/mudrodCoreTestData/rankingResults/inputDataForSVM_spark.txt");
-  }
-
-}
http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/ranking/SparkSVM.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/ranking/SparkSVM.java b/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/ranking/SparkSVM.java
deleted file mode 100644
index 1ddebf3..0000000
--- a/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/ranking/SparkSVM.java
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * Licensed under the Apache License, Version 2.0 (the "License"); you
- * may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package gov.nasa.jpl.mudrod.ssearch.ranking;
-
-import gov.nasa.jpl.mudrod.main.MudrodEngine;
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.mllib.classification.SVMModel;
-import org.apache.spark.mllib.classification.SVMWithSGD;
-import org.apache.spark.mllib.regression.LabeledPoint;
-import org.apache.spark.mllib.util.MLUtils;
-
-/**
- * Command-line entry point that trains a Spark SVM on the LibSVM file bundled on
- * the classpath and saves the resulting model.
- */
-public class SparkSVM {
-
-  private SparkSVM() {
-    // only a main method is offered; the class is not meant to be instantiated
-  }
-
-  /**
-   * Trains an SVM with SGD (100 iterations) on the classpath resource
-   * "inputDataForSVM_spark.txt" and saves the model to the location of the
-   * classpath resource "javaSVMWithSGDModel". The Spark context is stopped
-   * whether or not training succeeds.
-   *
-   * @param args not used
-   * @throws IllegalStateException if one of the two resources is not on the classpath
-   */
-  public static void main(String[] args) {
-    MudrodEngine me = new MudrodEngine();
-
-    JavaSparkContext jsc = me.startSparkDriver().sc;
-    try {
-      String path = resourceLocation("inputDataForSVM_spark.txt");
-      JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), path).toJavaRDD();
-
-      // Run training algorithm to build the model.
-      int numIterations = 100;
-      final SVMModel model = SVMWithSGD.train(data.rdd(), numIterations);
-
-      // Save the model
-      model.save(jsc.sc(), resourceLocation("javaSVMWithSGDModel"));
-    } finally {
-      jsc.sc().stop(); // release the context even when training or saving fails
-    }
-  }
-
-  /**
-   * Resolves a classpath resource to its URL string, failing with a descriptive
-   * message instead of a NullPointerException when the resource is missing.
-   */
-  private static String resourceLocation(String name) {
-    java.net.URL url = SparkSVM.class.getClassLoader().getResource(name);
-    if (url == null) {
-      throw new IllegalStateException("Resource not found on classpath: " + name);
-    }
-    return url.toString();
-  }
-
-}
http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/ranking/TrainingImporter.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/ranking/TrainingImporter.java b/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/ranking/TrainingImporter.java
deleted file mode 100644
index ae48b55..0000000
--- a/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/ranking/TrainingImporter.java
+++ /dev/null
@@ -1,91 +0,0 @@
-/*
- * Licensed under the Apache License, Version 2.0 (the "License"); you
- * may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package gov.nasa.jpl.mudrod.ssearch.ranking;
-
-import gov.nasa.jpl.mudrod.discoveryengine.MudrodAbstract;
-import gov.nasa.jpl.mudrod.driver.ESDriver;
-import gov.nasa.jpl.mudrod.driver.SparkDriver;
-import gov.nasa.jpl.mudrod.main.MudrodConstants;
-import org.elasticsearch.action.index.IndexRequest;
-import org.elasticsearch.common.xcontent.XContentBuilder;
-import org.elasticsearch.index.query.QueryBuilders;
-
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileReader;
-import java.io.IOException;
-import java.util.Properties;
-
-import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
-
-/**
- * Imports a training set into the "trainingranking" type of Elasticsearch.
- */
-public class TrainingImporter extends MudrodAbstract {
-  private static final long serialVersionUID = 1L;
-
-  /**
-   * Deletes every existing "trainingranking" document of the configured index and
-   * (re)installs the mapping of that type.
-   *
-   * @param props the Mudrod configuration
-   * @param es    the Elasticsearch driver
-   * @param spark the Spark driver
-   */
-  public TrainingImporter(Properties props, ESDriver es, SparkDriver spark) {
-    super(props, es, spark);
-    es.deleteAllByQuery(props.getProperty(MudrodConstants.ES_INDEX_NAME), "trainingranking", QueryBuilders.matchAllQuery());
-    addMapping();
-  }
-
-  /**
-   * Adds the mapping of the training set type: the fields "query", "dataID" and
-   * "label" are all declared as not-analyzed strings. A failure to build the
-   * mapping is printed to standard error.
-   */
-  public void addMapping() {
-    try {
-      XContentBuilder mapping = jsonBuilder().startObject().startObject("trainingranking").startObject("properties").startObject("query").field("type", "string").field("index", "not_analyzed").endObject()
-          .startObject("dataID").field("type", "string").field("index", "not_analyzed").endObject().startObject("label").field("type", "string").field("index", "not_analyzed").endObject().endObject()
-          .endObject().endObject();
-
-      es.getClient().admin().indices().preparePutMapping(props.getProperty("indexName")).setType("trainingranking").setSource(mapping).execute().actionGet();
-    } catch (IOException e) {
-      e.printStackTrace();
-    }
-  }
-
-  /**
-   * Imports every file of a folder into Elasticsearch. The file name without
-   * ".csv" is used as the query, the first line of each file is skipped as
-   * header, and in every other line the first column becomes the data ID and the
-   * last column the label. The bulk processor is destroyed even when reading
-   * fails.
-   *
-   * @param dataFolder the path to the training set
-   * @throws IOException if the folder cannot be listed or a file cannot be read
-   */
-  public void importTrainingSet(String dataFolder) throws IOException {
-    File[] files = new File(dataFolder).listFiles();
-    if (files == null) {
-      throw new IOException("Cannot list training files in: " + dataFolder);
-    }
-
-    es.createBulkProcessor();
-    try {
-      for (File file : files) {
-        String query = file.getName().replace(".csv", "");
-        try (BufferedReader br = new BufferedReader(new FileReader(file.getAbsolutePath()))) {
-          br.readLine(); // skip the header line
-          for (String line = br.readLine(); line != null; line = br.readLine()) {
-            String[] list = line.split(",");
-            if (list.length > 0) { // a line of separators only splits into nothing
-              IndexRequest ir = new IndexRequest(props.getProperty("indexName"), "trainingranking")
-                  .source(jsonBuilder().startObject().field("query", query).field("dataID", list[0]).field("label", list[list.length - 1]).endObject());
-              es.getBulkProcessor().add(ir);
-            }
-          }
-        }
-      }
-    } finally {
-      es.destroyBulkProcessor();
-    }
-  }
-}
http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/ranking/package-info.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/ranking/package-info.java b/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/ranking/package-info.java
deleted file mode 100644
index 205e7a7..0000000
--- a/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/ranking/package-info.java
+++ /dev/null
@@ -1,18 +0,0 @@
-/*
- * Licensed under the Apache License, Version 2.0 (the "License"); you
- * may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-/**
- * This package includes classes for importing training data, ML models,
- * generating input data for RankSVM, and evaluating ranking results
- */
-package gov.nasa.jpl.mudrod.ssearch.ranking;
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/structure/SResult.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/structure/SResult.java b/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/structure/SResult.java
deleted file mode 100644
index 33b6233..0000000
--- a/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/structure/SResult.java
+++ /dev/null
@@ -1,183 +0,0 @@
-/*
- * Licensed under the Apache License, Version 2.0 (the "License"); you
- * may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package gov.nasa.jpl.mudrod.ssearch.structure;
-
-import java.lang.reflect.Field;
-
-/**
- * Data structure class for search result
- */
-public class SResult {
- public static final String rlist[] = { "term_score", "releaseDate_score", /*"versionNum_score",*/
- "processingL_score", "allPop_score", "monthPop_score", "userPop_score"/*, "termAndv_score"*/ };
- String shortName = null;
- String longName = null;
- String topic = null;
- String description = null;
- String relase_date = null;
-
- public Double final_score = 0.0;
- public Double term_score = 0.0;
- public Double releaseDate_score = 0.0;
- public Double versionNum_score = 0.0;
- public Double processingL_score = 0.0;
- public Double click_score = 0.0;
- public Double allPop_score = 0.0;
- public Double monthPop_score = 0.0;
- public Double userPop_score = 0.0;
- public Double termAndv_score = 0.0;
- public Integer below = 0;
-
- public Double Dataset_LongName_score = null;
- public Double Dataset_Metadata_score = null;
- public Double DatasetParameter_Term_score = null;
- public Double DatasetSource_Source_LongName_score = null;
- public Double DatasetSource_Sensor_LongName_score = null;
-
- public String version = null;
- public String processingLevel = null;
- public String latency = null;
- public String stopDateLong = null;
- public String stopDateFormat = null;
- public Double spatialR_Sat = null;
- public Double spatialR_Grid = null;
- public String temporalR = null;
-
- public Double releaseDate = null;
- public Double click = null;
- public Double term = null;
- public Double versionNum = null;
- public Double processingL = null;
- public Double allPop = null;
- public Double monthPop = null;
- public Double userPop = null;
- public Double termAndv = null;
-
- public Double Dataset_LongName = null;
- public Double Dataset_Metadata = null;
- public Double DatasetParameter_Term = null;
- public Double DatasetSource_Source_LongName = null;
- public Double DatasetSource_Sensor_LongName = null;
-
- public Double prediction = 0.0;
- public String label = null;
-
- //add by quintinali
- public String startDate;
- public String endDate;
- public String sensors;
-
- /**
- * @param shortName short name of dataset
- * @param longName long name of dataset
- * @param topic topic of dataset
- * @param description description of dataset
- * @param date release date of dataset
- */
- public SResult(String shortName, String longName, String topic, String description, String date) {
- this.shortName = shortName;
- this.longName = longName;
- this.topic = topic;
- this.description = description;
- this.relase_date = date;
- }
-
- public SResult(SResult sr) {
- for (int i = 0; i < rlist.length; i++) {
- set(this, rlist[i], get(sr, rlist[i]));
- }
- }
-
- /**
- * Method of getting export header
- *
- * @param delimiter the delimiter used to separate strings
- * @return header
- */
- public static String getHeader(String delimiter) {
- String str = "";
- for (int i = 0; i < rlist.length; i++) {
- str += rlist[i] + delimiter;
- }
- str = str + "label" + "\n";
- return "ShortName" + delimiter + "below" + delimiter + str;
- }
-
- /**
- * Method of get a search results as string
- *
- * @param delimiter the delimiter used to separate strings
- * @return search result as string
- */
- public String toString(String delimiter) {
- String str = "";
- for (int i = 0; i < rlist.length; i++) {
- double score = get(this, rlist[i]);
- str += score + delimiter;
- }
- str = str + label + "\n";
- return shortName + delimiter + below + delimiter + str;
- }
-
- /**
- * Generic setter method
- *
- * @param object instance of SResult
- * @param fieldName field name that needs to be set on
- * @param fieldValue field value that needs to be set to
- * @return 1 means success, and 0 otherwise
- */
- public static boolean set(Object object, String fieldName, Object fieldValue) {
- Class<?> clazz = object.getClass();
- while (clazz != null) {
- try {
- Field field = clazz.getDeclaredField(fieldName);
- field.setAccessible(true);
- field.set(object, fieldValue);
- return true;
- } catch (NoSuchFieldException e) {
- clazz = clazz.getSuperclass();
- } catch (Exception e) {
- throw new IllegalStateException(e);
- }
- }
- return false;
- }
-
- /**
- * Generic getter method
- *
- * @param object instance of SResult
- * @param fieldName field name of search result
- * @param <V> data type
- * @return the value of the filed in the object
- */
- @SuppressWarnings("unchecked")
- public static <V> V get(Object object, String fieldName) {
- Class<?> clazz = object.getClass();
- while (clazz != null) {
- try {
- Field field = clazz.getDeclaredField(fieldName);
- field.setAccessible(true);
- return (V) field.get(object);
- } catch (NoSuchFieldException e) {
- clazz = clazz.getSuperclass();
- } catch (Exception e) {
- throw new IllegalStateException(e);
- }
- }
- return null;
- }
-
-}
http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/structure/package-info.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/structure/package-info.java b/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/structure/package-info.java
deleted file mode 100644
index a0f9ce5..0000000
--- a/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/structure/package-info.java
+++ /dev/null
@@ -1,17 +0,0 @@
-/*
- * Licensed under the Apache License, Version 2.0 (the "License"); you
- * may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-/**
- * This package includes data structure needed for ranking process
- */
-package gov.nasa.jpl.mudrod.ssearch.structure;
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/utils/ESTransportClient.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/utils/ESTransportClient.java b/core/src/main/java/gov/nasa/jpl/mudrod/utils/ESTransportClient.java
deleted file mode 100644
index 151ac8d..0000000
--- a/core/src/main/java/gov/nasa/jpl/mudrod/utils/ESTransportClient.java
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Licensed under the Apache License, Version 2.0 (the "License"); you
- * may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package gov.nasa.jpl.mudrod.utils;
-
-import org.elasticsearch.client.transport.TransportClient;
-import org.elasticsearch.common.settings.Settings;
-import org.elasticsearch.index.reindex.ReindexPlugin;
-import org.elasticsearch.percolator.PercolatorPlugin;
-import org.elasticsearch.plugins.Plugin;
-import org.elasticsearch.script.mustache.MustachePlugin;
-import org.elasticsearch.transport.Netty3Plugin;
-
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.Collections;
-
-/**
- * A builder to create an instance of {@link TransportClient} This class
- * pre-installs the {@link Netty3Plugin}, for the client. These plugins are all
- * elasticsearch core modules required.
- */
-@SuppressWarnings({ "unchecked", "varargs" })
-public class ESTransportClient extends TransportClient {
-
- private static final Collection<Class<? extends Plugin>> PRE_INSTALLED_PLUGINS = Collections
- .unmodifiableList(Arrays.asList(ReindexPlugin.class, PercolatorPlugin.class, MustachePlugin.class, Netty3Plugin.class));
-
- @SafeVarargs
- public ESTransportClient(Settings settings, Class<? extends Plugin>... plugins) {
- this(settings, Arrays.asList(plugins));
- }
-
- public ESTransportClient(Settings settings, Collection<Class<? extends Plugin>> plugins) {
- super(settings, Settings.EMPTY, addPlugins(plugins, PRE_INSTALLED_PLUGINS), null);
-
- }
-
- @Override
- public void close() {
- super.close();
- }
-
-}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/utils/HttpRequest.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/utils/HttpRequest.java b/core/src/main/java/gov/nasa/jpl/mudrod/utils/HttpRequest.java
deleted file mode 100644
index be0a46d..0000000
--- a/core/src/main/java/gov/nasa/jpl/mudrod/utils/HttpRequest.java
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Licensed under the Apache License, Version 2.0 (the "License"); you
- * may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package gov.nasa.jpl.mudrod.utils;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.BufferedReader;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.net.HttpURLConnection;
-import java.net.URL;
-
-/**
- * ClassName: HttpRequest
- * Function: Http request tool.
- */
-public class HttpRequest {
-
- private static final Logger LOG = LoggerFactory.getLogger(HttpRequest.class);
-
- public HttpRequest() {
- }
-
- public String getRequest(String requestUrl) {
- String line = null;
- try {
- URL url = new URL(requestUrl);
-
- HttpURLConnection connection = (HttpURLConnection) url.openConnection();
- connection.setDoOutput(true);
-
- connection.setConnectTimeout(5000);
- connection.setReadTimeout(5000);
- int code = connection.getResponseCode();
- if (code != HttpURLConnection.HTTP_OK) {
- line = "{\"exception\":\"Service failed\"}";
- LOG.info(line);
- } else {
- InputStream content = connection.getInputStream();
- BufferedReader in = new BufferedReader(new InputStreamReader(content));
- line = in.readLine();
- }
- } catch (Exception e) {
- line = "{\"exception\":\"No service was found\"}";
- LOG.error(line);
- }
- return line;
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/utils/LabeledRowMatrix.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/utils/LabeledRowMatrix.java b/core/src/main/java/gov/nasa/jpl/mudrod/utils/LabeledRowMatrix.java
deleted file mode 100644
index d1d144b..0000000
--- a/core/src/main/java/gov/nasa/jpl/mudrod/utils/LabeledRowMatrix.java
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Licensed under the Apache License, Version 2.0 (the "License"); you
- * may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package gov.nasa.jpl.mudrod.utils;
-
-import org.apache.spark.mllib.linalg.distributed.RowMatrix;
-
-import java.util.List;
-
-/**
- * ClassName: LabeledRowMatrix
- * Function: LabeledRowMatrix strut.
- */
-public class LabeledRowMatrix {
-
- // words: matrix row titles
- public List<String> rowkeys;
- // docs: matrix column titles
- public List<String> colkeys;
- // wordDocMatrix: a matrix in which each row is corresponding to a term and
- // each column is a doc.
- public RowMatrix rowMatrix;
-
- public LabeledRowMatrix() {
- // TODO Auto-generated constructor stub
- }
-
-}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/utils/LinkageTriple.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/utils/LinkageTriple.java b/core/src/main/java/gov/nasa/jpl/mudrod/utils/LinkageTriple.java
deleted file mode 100644
index 90b1568..0000000
--- a/core/src/main/java/gov/nasa/jpl/mudrod/utils/LinkageTriple.java
+++ /dev/null
@@ -1,192 +0,0 @@
-/*
- * Licensed under the Apache License, Version 2.0 (the "License"); you
- * may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package gov.nasa.jpl.mudrod.utils;
-
-import gov.nasa.jpl.mudrod.driver.ESDriver;
-import org.elasticsearch.action.index.IndexRequest;
-import org.elasticsearch.action.search.SearchRequestBuilder;
-import org.elasticsearch.action.search.SearchResponse;
-import org.elasticsearch.action.update.UpdateRequest;
-import org.elasticsearch.common.unit.TimeValue;
-import org.elasticsearch.common.xcontent.XContentBuilder;
-import org.elasticsearch.index.query.QueryBuilders;
-import org.elasticsearch.search.SearchHit;
-import org.elasticsearch.search.aggregations.AggregationBuilders;
-import org.elasticsearch.search.aggregations.bucket.terms.Terms;
-import org.elasticsearch.search.sort.SortOrder;
-
-import java.io.IOException;
-import java.io.Serializable;
-import java.text.DecimalFormat;
-import java.util.List;
-import java.util.Map;
-
-import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
-
-/**
- * ClassName: LinkageTriple Function: Vocabulary linkage operations
- */
-public class LinkageTriple implements Serializable {
-
- /**
- *
- */
- private static final long serialVersionUID = 1L;
- // keyAId: ID of term A
- public long keyAId;
- // keyBId: ID of term B
- public long keyBId;
- // weight: The relationship between term A and Term B
- public double weight;
- // keyA: TermA
- public String keyA;
- // keyB: TermB
- public String keyB;
- // df: Format number
- public static DecimalFormat df = new DecimalFormat("#.00");
-
- public LinkageTriple() {
- // TODO Auto-generated constructor stub
- }
-
- /**
- * TODO Output linkage triples in string format.
- *
- * @see java.lang.Object#toString()
- */
- @Override
- public String toString() {
- return keyA + "," + keyB + ":" + weight;
- }
-
- public static void insertTriples(ESDriver es, List<LinkageTriple> triples, String index, String type) throws IOException {
- LinkageTriple.insertTriples(es, triples, index, type, false, false);
- }
-
- public static void insertTriples(ESDriver es, List<LinkageTriple> triples, String index, String type, Boolean bTriple, boolean bSymmetry) throws IOException {
- es.deleteType(index, type);
- if (bTriple) {
- LinkageTriple.addMapping(es, index, type);
- }
-
- if (triples == null) {
- return;
- }
-
- es.createBulkProcessor();
- int size = triples.size();
- for (int i = 0; i < size; i++) {
-
- XContentBuilder jsonBuilder = jsonBuilder().startObject();
- if (bTriple) {
-
- jsonBuilder.field("concept_A", triples.get(i).keyA);
- jsonBuilder.field("concept_B", triples.get(i).keyB);
-
- } else {
- jsonBuilder.field("keywords", triples.get(i).keyA + "," + triples.get(i).keyB);
- }
-
- jsonBuilder.field("weight", Double.parseDouble(df.format(triples.get(i).weight)));
- jsonBuilder.endObject();
-
- IndexRequest ir = new IndexRequest(index, type).source(jsonBuilder);
- es.getBulkProcessor().add(ir);
-
- if (bTriple && bSymmetry) {
- XContentBuilder symmetryJsonBuilder = jsonBuilder().startObject();
- symmetryJsonBuilder.field("concept_A", triples.get(i).keyB);
- symmetryJsonBuilder.field("concept_B", triples.get(i).keyA);
-
- symmetryJsonBuilder.field("weight", Double.parseDouble(df.format(triples.get(i).weight)));
-
- symmetryJsonBuilder.endObject();
-
- IndexRequest symmetryir = new IndexRequest(index, type).source(symmetryJsonBuilder);
- es.getBulkProcessor().add(symmetryir);
- }
- }
- es.destroyBulkProcessor();
- }
-
- public static void addMapping(ESDriver es, String index, String type) {
- XContentBuilder Mapping;
- try {
- Mapping = jsonBuilder().startObject().startObject(type).startObject("properties").startObject("concept_A").field("type", "string").field("index", "not_analyzed").endObject()
- .startObject("concept_B").field("type", "string").field("index", "not_analyzed").endObject()
-
- .endObject().endObject().endObject();
-
- es.getClient().admin().indices().preparePutMapping(index).setType(type).setSource(Mapping).execute().actionGet();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
-
- public static void standardTriples(ESDriver es, String index, String type) throws IOException {
- es.createBulkProcessor();
-
- SearchResponse sr = es.getClient().prepareSearch(index).setTypes(type).setQuery(QueryBuilders.matchAllQuery()).setSize(0)
- .addAggregation(AggregationBuilders.terms("concepts").field("concept_A").size(0)).execute().actionGet();
- Terms concepts = sr.getAggregations().get("concepts");
-
- for (Terms.Bucket entry : concepts.getBuckets()) {
- String concept = (String) entry.getKey();
- double maxSim = LinkageTriple.getMaxSimilarity(es, index, type, concept);
- if (maxSim == 1.0) {
- continue;
- }
-
- SearchResponse scrollResp = es.getClient().prepareSearch(index).setTypes(type).setScroll(new TimeValue(60000)).setQuery(QueryBuilders.termQuery("concept_A", concept))
- .addSort("weight", SortOrder.DESC).setSize(100).execute().actionGet();
-
- while (true) {
- for (SearchHit hit : scrollResp.getHits().getHits()) {
- Map<String, Object> metadata = hit.getSource();
- double sim = (double) metadata.get("weight");
- double newSim = sim / maxSim;
- UpdateRequest ur = es.generateUpdateRequest(index, type, hit.getId(), "weight", Double.parseDouble(df.format(newSim)));
- es.getBulkProcessor().add(ur);
- }
-
- scrollResp = es.getClient().prepareSearchScroll(scrollResp.getScrollId()).setScroll(new TimeValue(600000)).execute().actionGet();
- if (scrollResp.getHits().getHits().length == 0) {
- break;
- }
- }
- }
-
- es.destroyBulkProcessor();
- }
-
- private static double getMaxSimilarity(ESDriver es, String index, String type, String concept) {
-
- double maxSim = 1.0;
- SearchRequestBuilder builder = es.getClient().prepareSearch(index).setTypes(type).setQuery(QueryBuilders.termQuery("concept_A", concept)).addSort("weight", SortOrder.DESC).setSize(1);
-
- SearchResponse usrhis = builder.execute().actionGet();
- SearchHit[] hits = usrhis.getHits().getHits();
- if (hits.length == 1) {
- SearchHit hit = hits[0];
- Map<String, Object> result = hit.getSource();
- maxSim = (double) result.get("weight");
- }
-
- if (maxSim == 0.0) {
- maxSim = 1.0;
- }
-
- return maxSim;
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/utils/MatrixUtil.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/utils/MatrixUtil.java b/core/src/main/java/gov/nasa/jpl/mudrod/utils/MatrixUtil.java
deleted file mode 100644
index 942f1e0..0000000
--- a/core/src/main/java/gov/nasa/jpl/mudrod/utils/MatrixUtil.java
+++ /dev/null
@@ -1,488 +0,0 @@
-/*
- * Licensed under the Apache License, Version 2.0 (the "License"); you
- * may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package gov.nasa.jpl.mudrod.utils;
-
-import gov.nasa.jpl.mudrod.driver.SparkDriver;
-import org.apache.spark.api.java.JavaPairRDD;
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.api.java.Optional;
-import org.apache.spark.api.java.function.*;
-import org.apache.spark.mllib.feature.IDF;
-import org.apache.spark.mllib.feature.IDFModel;
-import org.apache.spark.mllib.linalg.*;
-import org.apache.spark.mllib.linalg.distributed.IndexedRow;
-import org.apache.spark.mllib.linalg.distributed.IndexedRowMatrix;
-import org.apache.spark.mllib.linalg.distributed.RowMatrix;
-import scala.Tuple2;
-
-import java.io.BufferedWriter;
-import java.io.File;
-import java.io.FileWriter;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Iterator;
-import java.util.List;
-import java.util.stream.Stream;
-
-/**
- * Matrix utility tool
- */
-public class MatrixUtil {
-
- private MatrixUtil() {
- }
-
- /**
- * buildSVDMatrix: Generate SVD matrix from TF-IDF matrix. Please make sure
- * the TF-IDF matrix has been already built from the original documents.
- *
- * @param tfidfMatrix,
- * each row is a term and each column is a document name and each
- * cell is the TF-IDF value of the term in the corresponding
- * document.
- * @param dimension
- * Column number of the SVD matrix
- * @return RowMatrix, each row is a term and each column is a dimension in the
- * feature space, each cell is value of the term in the corresponding
- * dimension.
- */
- public static RowMatrix buildSVDMatrix(RowMatrix tfidfMatrix, int dimension) {
- int matrixCol = (int) tfidfMatrix.numCols();
- if (matrixCol < dimension) {
- dimension = matrixCol;
- }
-
- SingularValueDecomposition<RowMatrix, Matrix> svd = tfidfMatrix.computeSVD(dimension, true, 1.0E-9d);
- RowMatrix u = svd.U();
- Vector s = svd.s();
- return u.multiply(Matrices.diag(s));
- }
-
- /**
- * buildSVDMatrix: Generate SVD matrix from Vector RDD.
- *
- * @param vecRDD
- * vectors of terms in feature space
- * @param dimension
- * Column number of the SVD matrix
- * @return RowMatrix, each row is a term and each column is a dimension in the
- * feature space, each cell is value of the term in the corresponding
- * dimension.
- */
- public static RowMatrix buildSVDMatrix(JavaRDD<Vector> vecRDD, int dimension) {
- RowMatrix tfidfMatrix = new RowMatrix(vecRDD.rdd());
- SingularValueDecomposition<RowMatrix, Matrix> svd = tfidfMatrix.computeSVD(dimension, true, 1.0E-9d);
- RowMatrix u = svd.U();
- Vector s = svd.s();
- return u.multiply(Matrices.diag(s));
- }
-
- /**
- * Create TF-IDF matrix from word-doc matrix.
- *
- * @param wordDocMatrix,
- * each row is a term, each column is a document name and each cell
- * is number of the term in the corresponding document.
- * @return RowMatrix, each row is a term and each column is a document name
- * and each cell is the TF-IDF value of the term in the corresponding
- * document.
- */
- public static RowMatrix createTFIDFMatrix(RowMatrix wordDocMatrix) {
- JavaRDD<Vector> newcountRDD = wordDocMatrix.rows().toJavaRDD();
- IDFModel idfModel = new IDF().fit(newcountRDD);
- JavaRDD<Vector> idf = idfModel.transform(newcountRDD);
- return new RowMatrix(idf.rdd());
- }
-
- /**
- * Create matrix from doc-terms JavaPairRDD.
- *
- * @param uniqueDocRDD
- * doc-terms JavaPairRDD, in which each key is a doc name, and value
- * is term list extracted from that doc
- * @return LabeledRowMatrix {@link LabeledRowMatrix}
- */
- public static LabeledRowMatrix createWordDocMatrix(JavaPairRDD<String, List<String>> uniqueDocRDD) {
- // Index documents with unique IDs
- JavaPairRDD<List<String>, Long> corpus = uniqueDocRDD.values().zipWithIndex();
- // cal word-doc numbers
- JavaPairRDD<Tuple2<String, Long>, Double> worddocNumRDD = corpus.flatMapToPair(new PairFlatMapFunction<Tuple2<List<String>, Long>, Tuple2<String, Long>, Double>() {
- /**
- *
- */
- private static final long serialVersionUID = 1L;
-
- @Override
- public Iterator<Tuple2<Tuple2<String, Long>, Double>> call(Tuple2<List<String>, Long> docwords) throws Exception {
- List<Tuple2<Tuple2<String, Long>, Double>> pairs = new ArrayList<>();
- List<String> words = docwords._1;
- int n = words.size();
- for (int i = 0; i < n; i++) {
- Tuple2<String, Long> worddoc = new Tuple2<>(words.get(i), docwords._2);
- pairs.add(new Tuple2<Tuple2<String, Long>, Double>(worddoc, 1.0));
- }
- return pairs.iterator();
- }
- }).reduceByKey(new Function2<Double, Double, Double>() {
- /**
- *
- */
- private static final long serialVersionUID = 1L;
-
- @Override
- public Double call(Double first, Double second) throws Exception {
- return first + second;
- }
- });
- // cal word doc-numbers
- JavaPairRDD<String, Tuple2<List<Long>, List<Double>>> wordDocnumRDD = worddocNumRDD
- .mapToPair(new PairFunction<Tuple2<Tuple2<String, Long>, Double>, String, Tuple2<List<Long>, List<Double>>>() {
- /**
- *
- */
- private static final long serialVersionUID = 1L;
-
- @Override
- public Tuple2<String, Tuple2<List<Long>, List<Double>>> call(Tuple2<Tuple2<String, Long>, Double> worddocNum) throws Exception {
- List<Long> docs = new ArrayList<>();
- docs.add(worddocNum._1._2);
- List<Double> nums = new ArrayList<>();
- nums.add(worddocNum._2);
- Tuple2<List<Long>, List<Double>> docmums = new Tuple2<>(docs, nums);
- return new Tuple2<>(worddocNum._1._1, docmums);
- }
- });
- // trans to vector
- final int corporsize = (int) uniqueDocRDD.keys().count();
- JavaPairRDD<String, Vector> wordVectorRDD = wordDocnumRDD.reduceByKey(new Function2<Tuple2<List<Long>, List<Double>>, Tuple2<List<Long>, List<Double>>, Tuple2<List<Long>, List<Double>>>() {
- /**
- *
- */
- private static final long serialVersionUID = 1L;
-
- @Override
- public Tuple2<List<Long>, List<Double>> call(Tuple2<List<Long>, List<Double>> arg0, Tuple2<List<Long>, List<Double>> arg1) throws Exception {
- arg0._1.addAll(arg1._1);
- arg0._2.addAll(arg1._2);
- return new Tuple2<>(arg0._1, arg0._2);
- }
- }).mapToPair(new PairFunction<Tuple2<String, Tuple2<List<Long>, List<Double>>>, String, Vector>() {
- /**
- *
- */
- private static final long serialVersionUID = 1L;
-
- @Override
- public Tuple2<String, Vector> call(Tuple2<String, Tuple2<List<Long>, List<Double>>> arg0) throws Exception {
- int docsize = arg0._2._1.size();
- int[] intArray = new int[docsize];
- double[] doubleArray = new double[docsize];
- for (int i = 0; i < docsize; i++) {
- intArray[i] = arg0._2._1.get(i).intValue();
- doubleArray[i] = arg0._2._2.get(i).intValue();
- }
- Vector sv = Vectors.sparse(corporsize, intArray, doubleArray);
- return new Tuple2<>(arg0._1, sv);
- }
- });
-
- RowMatrix wordDocMatrix = new RowMatrix(wordVectorRDD.values().rdd());
-
- LabeledRowMatrix labeledRowMatrix = new LabeledRowMatrix();
- labeledRowMatrix.rowMatrix = wordDocMatrix;
- labeledRowMatrix.rowkeys = wordVectorRDD.keys().collect();
- labeledRowMatrix.colkeys = uniqueDocRDD.keys().collect();
- return labeledRowMatrix;
- }
-
- public static LabeledRowMatrix createDocWordMatrix(JavaPairRDD<String, List<String>> uniqueDocRDD, JavaSparkContext sc) {
- // Index word with unique IDs
- JavaPairRDD<String, Long> wordIDRDD = uniqueDocRDD.values().flatMap(new FlatMapFunction<List<String>, String>() {
- /**
- *
- */
- private static final long serialVersionUID = 1L;
-
- @Override
- public Iterator<String> call(List<String> arg0) throws Exception {
- return arg0.iterator();
- }
- }).distinct().zipWithIndex();
-
- //
- JavaPairRDD<Tuple2<String, String>, Double> docwordNumRDD = uniqueDocRDD.flatMapToPair(new PairFlatMapFunction<Tuple2<String, List<String>>, Tuple2<String, String>, Double>() {
-
- /**
- *
- */
- private static final long serialVersionUID = 1L;
-
- @Override
- public Iterator<Tuple2<Tuple2<String, String>, Double>> call(Tuple2<String, List<String>> docwords) throws Exception {
- List<Tuple2<Tuple2<String, String>, Double>> pairs = new ArrayList<>();
- List<String> words = docwords._2;
- int n = words.size();
- for (int i = 0; i < n; i++) {
- Tuple2<String, String> worddoc = new Tuple2<>(docwords._1, words.get(i));
- pairs.add(new Tuple2<Tuple2<String, String>, Double>(worddoc, 1.0));
- }
- return pairs.iterator();
- }
- }).reduceByKey(new Function2<Double, Double, Double>() {
- /**
- *
- */
- private static final long serialVersionUID = 1L;
-
- @Override
- public Double call(Double first, Double second) throws Exception {
- return first + second;
- }
- });
-
- //
- JavaPairRDD<String, Tuple2<String, Double>> wordDocnumRDD = docwordNumRDD.mapToPair(new PairFunction<Tuple2<Tuple2<String, String>, Double>, String, Tuple2<String, Double>>() {
- /**
- *
- */
- private static final long serialVersionUID = 1L;
-
- @Override
- public Tuple2<String, Tuple2<String, Double>> call(Tuple2<Tuple2<String, String>, Double> arg0) throws Exception {
-
- Tuple2<String, Double> wordmums = new Tuple2<>(arg0._1._1, arg0._2);
- return new Tuple2<>(arg0._1._2, wordmums);
- }
- });
-
- //
-
- JavaPairRDD<String, Tuple2<Tuple2<String, Double>, Optional<Long>>> testRDD = wordDocnumRDD.leftOuterJoin(wordIDRDD);
-
- int wordsize = (int) wordIDRDD.count();
- JavaPairRDD<String, Vector> docVectorRDD = testRDD.mapToPair(new PairFunction<Tuple2<String, Tuple2<Tuple2<String, Double>, Optional<Long>>>, String, Tuple2<List<Long>, List<Double>>>() {
- /**
- *
- */
- private static final long serialVersionUID = 1L;
-
- @Override
- public Tuple2<String, Tuple2<List<Long>, List<Double>>> call(Tuple2<String, Tuple2<Tuple2<String, Double>, Optional<Long>>> arg0) throws Exception {
- Optional<Long> oid = arg0._2._2;
- Long wordId = (long) 0;
- if (oid.isPresent()) {
- wordId = oid.get();
- }
-
- List<Long> word = new ArrayList<>();
- word.add(wordId);
-
- List<Double> count = new ArrayList<>();
- count.add(arg0._2._1._2);
-
- Tuple2<List<Long>, List<Double>> wordcount = new Tuple2<>(word, count);
-
- return new Tuple2<>(arg0._2._1._1, wordcount);
- }
-
- }).reduceByKey(new Function2<Tuple2<List<Long>, List<Double>>, Tuple2<List<Long>, List<Double>>, Tuple2<List<Long>, List<Double>>>() {
- /**
- *
- */
- private static final long serialVersionUID = 1L;
-
- @Override
- public Tuple2<List<Long>, List<Double>> call(Tuple2<List<Long>, List<Double>> arg0, Tuple2<List<Long>, List<Double>> arg1) throws Exception {
- arg0._1.addAll(arg1._1);
- arg0._2.addAll(arg1._2);
- return new Tuple2<>(arg0._1, arg0._2);
- }
- }).mapToPair(new PairFunction<Tuple2<String, Tuple2<List<Long>, List<Double>>>, String, Vector>() {
- /**
- *
- */
- private static final long serialVersionUID = 1L;
-
- @Override
- public Tuple2<String, Vector> call(Tuple2<String, Tuple2<List<Long>, List<Double>>> arg0) throws Exception {
- int docsize = arg0._2._1.size();
- int[] intArray = new int[docsize];
- double[] doubleArray = new double[docsize];
- for (int i = 0; i < docsize; i++) {
- intArray[i] = arg0._2._1.get(i).intValue();
- doubleArray[i] = arg0._2._2.get(i).intValue();
- }
- Vector sv = Vectors.sparse(wordsize, intArray, doubleArray);
- return new Tuple2<>(arg0._1, sv);
- }
- });
-
- RowMatrix docwordMatrix = new RowMatrix(docVectorRDD.values().rdd());
-
- LabeledRowMatrix labeledRowMatrix = new LabeledRowMatrix();
- labeledRowMatrix.rowMatrix = docwordMatrix;
- labeledRowMatrix.rowkeys = docVectorRDD.keys().collect();
- labeledRowMatrix.colkeys = wordIDRDD.keys().collect();
-
- return labeledRowMatrix;
- }
-
- /**
- * loadVectorFromCSV: Load term vector from csv file.
- *
- * @param spark
- * spark instance
- * @param csvFileName
- * csv matrix file
- * @param skipNum
- * the numbers of rows which should be skipped Ignore the top skip
- * number rows of the csv file
- * @return JavaPairRDD, each key is a term, and value is the vector of the
- * term in feature space.
- */
- public static JavaPairRDD<String, Vector> loadVectorFromCSV(SparkDriver spark, String csvFileName, int skipNum) {
- // skip the first line (header), important!
- JavaRDD<String> importRDD = spark.sc.textFile(csvFileName);
- JavaPairRDD<String, Long> importIdRDD = importRDD.zipWithIndex().filter(new Function<Tuple2<String, Long>, Boolean>() {
- /** */
- private static final long serialVersionUID = 1L;
-
- @Override
- public Boolean call(Tuple2<String, Long> v1) throws Exception {
- if (v1._2 > (skipNum - 1)) {
- return true;
- }
- return false;
- }
- });
-
- if (importIdRDD.count() == 0) {
- return null;
- }
-
- return importIdRDD.mapToPair(new PairFunction<Tuple2<String, Long>, String, Vector>() {
- /** */
- private static final long serialVersionUID = 1L;
-
- @Override
- public Tuple2<String, Vector> call(Tuple2<String, Long> t) throws Exception {
- String[] fields = t._1.split(",");
- String word = fields[0];
- int fieldsize = fields.length;
- int nStart = 1;
- int nEnd = fieldsize - 1;
- if (fieldsize < 2) {
- nStart = 0;
- nEnd = 0;
- }
- String[] numfields = Arrays.copyOfRange(fields, nStart, nEnd);
-
- double[] nums = Stream.of(numfields).mapToDouble(Double::parseDouble).toArray();
- Vector vec = Vectors.dense(nums);
- return new Tuple2<>(word, vec);
- }
- });
- }
-
- /**
- * Convert vectorRDD to indexed row matrix.
- *
- * @param vecs
- * Vector RDD
- * @return IndexedRowMatrix
- */
- public static IndexedRowMatrix buildIndexRowMatrix(JavaRDD<Vector> vecs) {
- JavaRDD<IndexedRow> indexrows = vecs.zipWithIndex().map(new Function<Tuple2<Vector, Long>, IndexedRow>() {
- /**
- *
- */
- private static final long serialVersionUID = 1L;
-
- @Override
- public IndexedRow call(Tuple2<Vector, Long> docId) {
- return new IndexedRow(docId._2, docId._1);
- }
- });
- return new IndexedRowMatrix(indexrows.rdd());
- }
-
- /**
- * Transpose matrix
- *
- * @param indexedMatrix
- * spark indexed matrix
- * @return rowmatrix, each row is corresponding to the column in the original
- * matrix and vice versa
- */
- public static RowMatrix transposeMatrix(IndexedRowMatrix indexedMatrix) {
- return indexedMatrix.toCoordinateMatrix().transpose().toRowMatrix();
- }
-
- /**
- * Output matrix to a CSV file.
- *
- * @param matrix
- * spark row matrix
- * @param rowKeys
- * matrix row names
- * @param colKeys
- * matrix coloum names
- * @param fileName
- * csv file name
- */
- public static void exportToCSV(RowMatrix matrix, List<String> rowKeys, List<String> colKeys, String fileName) {
-
- if (matrix.rows().isEmpty()) {
- return;
- }
-
- int rownum = (int) matrix.numRows();
- int colnum = (int) matrix.numCols();
- List<Vector> rows = matrix.rows().toJavaRDD().collect();
-
- File file = new File(fileName);
- if (file.exists()) {
- file.delete();
- }
- try {
- file.createNewFile();
- FileWriter fw = new FileWriter(file.getAbsoluteFile());
- BufferedWriter bw = new BufferedWriter(fw);
- String coltitle = " Num" + ",";
- for (int j = 0; j < colnum; j++) {
- coltitle += "\"" + colKeys.get(j) + "\",";
- }
- coltitle = coltitle.substring(0, coltitle.length() - 1);
- bw.write(coltitle + "\n");
-
- for (int i = 0; i < rownum; i++) {
- double[] rowvlaue = rows.get(i).toArray();
- String row = rowKeys.get(i) + ",";
- for (int j = 0; j < colnum; j++) {
- row += rowvlaue[j] + ",";
- }
- row = row.substring(0, row.length() - 1);
- bw.write(row + "\n");
- }
-
- bw.close();
-
- } catch (IOException e) {
- e.printStackTrace();
-
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/utils/RDDUtil.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/utils/RDDUtil.java b/core/src/main/java/gov/nasa/jpl/mudrod/utils/RDDUtil.java
deleted file mode 100644
index 3fc45f4..0000000
--- a/core/src/main/java/gov/nasa/jpl/mudrod/utils/RDDUtil.java
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Licensed under the Apache License, Version 2.0 (the "License"); you
- * may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package gov.nasa.jpl.mudrod.utils;
-
-import org.apache.spark.api.java.JavaPairRDD;
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.function.FlatMapFunction;
-
-import java.util.Iterator;
-import java.util.List;
-
-/**
- * ClassName: RDDUtil Function: Mudrod Spark RDD common methods
- */
-public class RDDUtil {
-
- public RDDUtil() {
- }
-
- /**
- * getAllWordsInDoc: Extracted all unique terms from all docs.
- *
- * @param docwordRDD Pair RDD, each key is a doc, and value is term list extracted from
- * that doc.
- * @return unique term list
- */
- public static JavaRDD<String> getAllWordsInDoc(JavaPairRDD<String, List<String>> docwordRDD) {
- JavaRDD<String> wordRDD = docwordRDD.values().flatMap(new FlatMapFunction<List<String>, String>() {
- /**
- *
- */
- private static final long serialVersionUID = 1L;
-
- @Override
- public Iterator<String> call(List<String> list) {
- return list.iterator();
- }
- }).distinct();
-
- return wordRDD;
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/utils/SVDUtil.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/utils/SVDUtil.java b/core/src/main/java/gov/nasa/jpl/mudrod/utils/SVDUtil.java
deleted file mode 100644
index 1982996..0000000
--- a/core/src/main/java/gov/nasa/jpl/mudrod/utils/SVDUtil.java
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
- * Licensed under the Apache License, Version 2.0 (the "License"); you
- * may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package gov.nasa.jpl.mudrod.utils;
-
-import gov.nasa.jpl.mudrod.discoveryengine.MudrodAbstract;
-import gov.nasa.jpl.mudrod.driver.ESDriver;
-import gov.nasa.jpl.mudrod.driver.SparkDriver;
-import org.apache.spark.api.java.JavaPairRDD;
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.mllib.linalg.Vector;
-import org.apache.spark.mllib.linalg.distributed.CoordinateMatrix;
-import org.apache.spark.mllib.linalg.distributed.RowMatrix;
-
-import java.io.IOException;
-import java.util.List;
-import java.util.Properties;
-
-/**
- * Singular value decomposition
- */
-public class SVDUtil extends MudrodAbstract {
-
- /**
- *
- */
- private static final long serialVersionUID = 1L;
- // wordRDD: terms extracted from all documents
- JavaRDD<String> wordRDD;
- // svdMatrix: svd matrix
- private RowMatrix svdMatrix;
- // simMatrix: similarity matrix
- private CoordinateMatrix simMatrix;
-
- /**
- * Creates a new instance of SVDUtil.
- *
- * @param config the Mudrod configuration
- * @param es the Elasticsearch drive
- * @param spark the spark driver
- */
- public SVDUtil(Properties config, ESDriver es, SparkDriver spark) {
- super(config, es, spark);
- }
-
- /**
- * Build SVD matrix from docment-terms pairs.
- *
- * @param docwordRDD JavaPairRDD, key is short name of data set and values are terms in
- * the corresponding data set
- * @param svdDimension: Dimension of matrix after singular value decomposition
- * @return row matrix
- */
- public RowMatrix buildSVDMatrix(JavaPairRDD<String, List<String>> docwordRDD, int svdDimension) {
-
- RowMatrix svdMatrix = null;
- LabeledRowMatrix wordDocMatrix = MatrixUtil.createWordDocMatrix(docwordRDD);
- RowMatrix ifIdfMatrix = MatrixUtil.createTFIDFMatrix(wordDocMatrix.rowMatrix);
- svdMatrix = MatrixUtil.buildSVDMatrix(ifIdfMatrix, svdDimension);
- this.svdMatrix = svdMatrix;
- this.wordRDD = RDDUtil.getAllWordsInDoc(docwordRDD);
- return svdMatrix;
- }
-
- /**
- * Build svd matrix from CSV file.
- *
- * @param tfidfCSVfile tf-idf matrix csv file
- * @param svdDimension: Dimension of matrix after singular value decomposition
- * @return row matrix
- */
- public RowMatrix buildSVDMatrix(String tfidfCSVfile, int svdDimension) {
- RowMatrix svdMatrix = null;
- JavaPairRDD<String, Vector> tfidfRDD = MatrixUtil.loadVectorFromCSV(spark, tfidfCSVfile, 2);
- JavaRDD<Vector> vectorRDD = tfidfRDD.values();
-
- svdMatrix = MatrixUtil.buildSVDMatrix(vectorRDD, svdDimension);
- this.svdMatrix = svdMatrix;
-
- this.wordRDD = tfidfRDD.keys();
-
- return svdMatrix;
- }
-
- /**
- * Calculate similarity
- */
- public void calSimilarity() {
- CoordinateMatrix simMatrix = SimilarityUtil.calculateSimilarityFromMatrix(svdMatrix);
- this.simMatrix = simMatrix;
- }
-
- /**
- * Insert linkage triples to elasticsearch
- *
- * @param index index name
- * @param type linkage triple name
- */
- public void insertLinkageToES(String index, String type) {
- List<LinkageTriple> triples = SimilarityUtil.matrixToTriples(wordRDD, simMatrix);
- try {
- LinkageTriple.insertTriples(es, triples, index, type);
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
-
-}
http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/utils/SimilarityUtil.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/utils/SimilarityUtil.java b/core/src/main/java/gov/nasa/jpl/mudrod/utils/SimilarityUtil.java
deleted file mode 100644
index 8ae9770..0000000
--- a/core/src/main/java/gov/nasa/jpl/mudrod/utils/SimilarityUtil.java
+++ /dev/null
@@ -1,277 +0,0 @@
-/*
- * Licensed under the Apache License, Version 2.0 (the "License"); you
- * may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package gov.nasa.jpl.mudrod.utils;
-
-import org.apache.spark.api.java.JavaPairRDD;
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.Optional;
-import org.apache.spark.api.java.function.Function;
-import org.apache.spark.api.java.function.PairFunction;
-import org.apache.spark.mllib.linalg.Vector;
-import org.apache.spark.mllib.linalg.distributed.CoordinateMatrix;
-import org.apache.spark.mllib.linalg.distributed.IndexedRowMatrix;
-import org.apache.spark.mllib.linalg.distributed.MatrixEntry;
-import org.apache.spark.mllib.linalg.distributed.RowMatrix;
-import scala.Tuple2;
-
-import java.util.List;
-
-/**
- * Similarity and distrance calculation utilities
- */
-public class SimilarityUtil {
-
- public static final int SIM_COSINE = 3;
- public static final int SIM_HELLINGER = 2;
- public static final int SIM_PEARSON = 1;
- /**
- * CalSimilarityFromMatrix: Calculate term similarity from matrix.
- *
- * @param svdMatrix. Each row is corresponding to a term, and each column is
- * corresponding to a dimension of feature
- * @return CoordinateMatrix, each row is corresponding to a term, and each
- * column is also a term, the cell value is the similarity between the
- * two terms
- */
- public static CoordinateMatrix calculateSimilarityFromMatrix(RowMatrix svdMatrix) {
- JavaRDD<Vector> vecs = svdMatrix.rows().toJavaRDD();
- return SimilarityUtil.calculateSimilarityFromVector(vecs);
- }
-
- /**
- * CalSimilarityFromVector:Calculate term similarity from vector.
- *
- * @param vecs Each vector is corresponding to a term in the feature space.
- * @return CoordinateMatrix, each row is corresponding to a term, and each
- * column is also a term, the cell value is the similarity between the
- * two terms
- */
- public static CoordinateMatrix calculateSimilarityFromVector(JavaRDD<Vector> vecs) {
- IndexedRowMatrix indexedMatrix = MatrixUtil.buildIndexRowMatrix(vecs);
- RowMatrix transposeMatrix = MatrixUtil.transposeMatrix(indexedMatrix);
- return transposeMatrix.columnSimilarities();
- }
-
- /**
- * Calculate term similarity from vector.
- *
- * @param importRDD the {@link org.apache.spark.api.java.JavaPairRDD}
- * data structure containing the vectors.
- * @param simType the similarity calculation to execute e.g.
- * <ul>
- * <li>{@link gov.nasa.jpl.mudrod.utils.SimilarityUtil#SIM_COSINE} - 3,</li>
- * <li>{@link gov.nasa.jpl.mudrod.utils.SimilarityUtil#SIM_HELLINGER} - 2,</li>
- * <li>{@link gov.nasa.jpl.mudrod.utils.SimilarityUtil#SIM_PEARSON} - 1</li>
- * </ul>
- * @return a new {@link org.apache.spark.api.java.JavaPairRDD}
- */
- public static JavaRDD<LinkageTriple> calculateSimilarityFromVector(JavaPairRDD<String, Vector> importRDD, int simType) {
- JavaRDD<Tuple2<String, Vector>> importRDD1 = importRDD.map(f -> new Tuple2<String, Vector>(f._1, f._2));
- JavaPairRDD<Tuple2<String, Vector>, Tuple2<String, Vector>> cartesianRDD = importRDD1.cartesian(importRDD1);
-
- return cartesianRDD.map(new Function<Tuple2<Tuple2<String, Vector>, Tuple2<String, Vector>>, LinkageTriple>() {
-
- /**
- *
- */
- private static final long serialVersionUID = 1L;
-
- @Override
- public LinkageTriple call(Tuple2<Tuple2<String, Vector>, Tuple2<String, Vector>> arg) {
- String keyA = arg._1._1;
- String keyB = arg._2._1;
-
- if (keyA.equals(keyB)) {
- return null;
- }
-
- Vector vecA = arg._1._2;
- Vector vecB = arg._2._2;
- Double weight = 0.0;
-
- if (simType == SimilarityUtil.SIM_PEARSON) {
- weight = SimilarityUtil.pearsonDistance(vecA, vecB);
- } else if (simType == SimilarityUtil.SIM_HELLINGER) {
- weight = SimilarityUtil.hellingerDistance(vecA, vecB);
- }
-
- LinkageTriple triple = new LinkageTriple();
- triple.keyA = keyA;
- triple.keyB = keyB;
- triple.weight = weight;
- return triple;
- }
- }).filter(new Function<LinkageTriple, Boolean>() {
- /**
- *
- */
- private static final long serialVersionUID = 1L;
-
- @Override
- public Boolean call(LinkageTriple arg0) throws Exception {
- if (arg0 == null) {
- return false;
- }
- return true;
- }
- });
- }
-
- /**
- * MatrixtoTriples:Convert term similarity matrix to linkage triple list.
- *
- * @param keys each key is a term
- * @param simMatirx term similarity matrix, in which each row and column is a term and
- * the cell value is the similarity between the two terms
- * @return linkage triple list
- */
- public static List<LinkageTriple> matrixToTriples(JavaRDD<String> keys, CoordinateMatrix simMatirx) {
- if (simMatirx.numCols() != keys.count()) {
- return null;
- }
-
- // index words
- JavaPairRDD<Long, String> keyIdRDD = JavaPairRDD.fromJavaRDD(keys.zipWithIndex().map(new Function<Tuple2<String, Long>, Tuple2<Long, String>>() {
- /**
- *
- */
- private static final long serialVersionUID = 1L;
-
- @Override
- public Tuple2<Long, String> call(Tuple2<String, Long> docId) {
- return docId.swap();
- }
- }));
-
- JavaPairRDD<Long, LinkageTriple> entriesRowRDD = simMatirx.entries().toJavaRDD().mapToPair(new PairFunction<MatrixEntry, Long, LinkageTriple>() {
- /**
- *
- */
- private static final long serialVersionUID = 1L;
-
- @Override
- public Tuple2<Long, LinkageTriple> call(MatrixEntry t) throws Exception {
- LinkageTriple triple = new LinkageTriple();
- triple.keyAId = t.i();
- triple.keyBId = t.j();
- triple.weight = t.value();
- return new Tuple2<>(triple.keyAId, triple);
- }
- });
-
- JavaPairRDD<Long, LinkageTriple> entriesColRDD = entriesRowRDD.leftOuterJoin(keyIdRDD).values().mapToPair(new PairFunction<Tuple2<LinkageTriple, Optional<String>>, Long, LinkageTriple>() {
- /**
- *
- */
- private static final long serialVersionUID = 1L;
-
- @Override
- public Tuple2<Long, LinkageTriple> call(Tuple2<LinkageTriple, Optional<String>> t) throws Exception {
- LinkageTriple triple = t._1;
- Optional<String> stra = t._2;
- if (stra.isPresent()) {
- triple.keyA = stra.get();
- }
- return new Tuple2<>(triple.keyBId, triple);
- }
- });
-
- JavaRDD<LinkageTriple> tripleRDD = entriesColRDD.leftOuterJoin(keyIdRDD).values().map(new Function<Tuple2<LinkageTriple, Optional<String>>, LinkageTriple>() {
- /**
- *
- */
- private static final long serialVersionUID = 1L;
-
- @Override
- public LinkageTriple call(Tuple2<LinkageTriple, Optional<String>> t) throws Exception {
- LinkageTriple triple = t._1;
- Optional<String> strb = t._2;
- if (strb.isPresent()) {
- triple.keyB = strb.get();
- }
- return triple;
- }
- });
- return tripleRDD.collect();
- }
-
- /**
- * Calculate similarity (Hellinger Distance) between vectors
- *
- * @param vecA initial vector from which to calculate a similarity
- * @param vecB second vector involved in similarity calculation
- * @return similarity between two vectors
- */
- public static double hellingerDistance(Vector vecA, Vector vecB) {
- double[] arrA = vecA.toArray();
- double[] arrB = vecB.toArray();
-
- double sim = 0.0;
-
- int arrsize = arrA.length;
- for (int i = 0; i < arrsize; i++) {
- double a = arrA[i];
- double b = arrB[i];
- double sqrtDiff = Math.sqrt(a) - Math.sqrt(b);
- sim += sqrtDiff * sqrtDiff;
- }
-
- sim = sim / Math.sqrt(2);
-
- return sim;
- }
-
- /**
- * Calculate similarity (Pearson Distance) between vectors
- *
- * @param vecA initial vector from which to calculate a similarity
- * @param vecB second vector involved in similarity calculation
- * @return similarity between two vectors
- */
- public static double pearsonDistance(Vector vecA, Vector vecB) {
- double[] arrA = vecA.toArray();
- double[] arrB = vecB.toArray();
-
- int viewA = 0;
- int viewB = 0;
- int viewAB = 0;
-
- int arrsize = arrA.length;
- for (int i = 0; i < arrsize; i++) {
- if (arrA[i] > 0) {
- viewA++;
- }
-
- if (arrB[i] > 0) {
- viewB++;
- }
-
- if (arrB[i] > 0 && arrA[i] > 0) {
- viewAB++;
- }
- }
- return viewAB / (Math.sqrt(viewA) * Math.sqrt(viewB));
- }
-
- /**
- * calculate similarity between vectors
- *
- * @param vecA initial vector from which to calculate a similarity
- * @param vecB second vector involved in similarity calculation
- * @return similarity between two vectors
- */
- public static double cosineDistance(Vector vecA, Vector vecB) {
- return 1;
- }
-}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/utils/package-info.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/utils/package-info.java b/core/src/main/java/gov/nasa/jpl/mudrod/utils/package-info.java
deleted file mode 100644
index 3fcd95e..0000000
--- a/core/src/main/java/gov/nasa/jpl/mudrod/utils/package-info.java
+++ /dev/null
@@ -1,18 +0,0 @@
-/*
- * Licensed under the Apache License, Version 2.0 (the "License"); you
- * may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-/**
- * This package includes utilities classes for calculating similarity and
- * parsing HTTP request
- */
-package gov.nasa.jpl.mudrod.utils;
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/weblog/package-info.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/weblog/package-info.java b/core/src/main/java/gov/nasa/jpl/mudrod/weblog/package-info.java
deleted file mode 100644
index f4a8b86..0000000
--- a/core/src/main/java/gov/nasa/jpl/mudrod/weblog/package-info.java
+++ /dev/null
@@ -1,18 +0,0 @@
-/*
- * Licensed under the Apache License, Version 2.0 (the "License"); you
- * may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-/**
- * This package includes web log pre-processing, processing, and data structure
- * classes.
- */
-package gov.nasa.jpl.mudrod.weblog;
\ No newline at end of file