You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by bg...@apache.org on 2014/01/06 18:48:32 UTC
svn commit: r1555944 [3/11] - in /opennlp/sandbox/opennlp-similarity/src:
main/java/opennlp/tools/apps/ main/java/opennlp/tools/apps/contentgen/
main/java/opennlp/tools/apps/contentgen/multithreaded/
main/java/opennlp/tools/apps/relevanceVocabs/ main/j...
Added: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/jsmlearning/FeatureSpaceCoverageProcessor.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/jsmlearning/FeatureSpaceCoverageProcessor.java?rev=1555944&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/jsmlearning/FeatureSpaceCoverageProcessor.java (added)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/jsmlearning/FeatureSpaceCoverageProcessor.java Mon Jan 6 17:48:30 2014
@@ -0,0 +1,317 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.jsmlearning;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.commons.lang.StringUtils;
+
+public class FeatureSpaceCoverageProcessor {
+
+ // Column name from the header row -> zero-based column index; populated by initParamMap.
+ public Map<String, Integer> paramMap = new HashMap<String, Integer>();
+ // Header row exactly as passed to initParamMap; used to map an index back to a column name.
+ public String[] header;
+ // Attribute names that the intersection and coverage methods iterate over; set by initParamMap.
+ String[] attributes;
+
+ // Creates a processor with an empty paramMap; header and attributes stay null until initParamMap is called.
+ public FeatureSpaceCoverageProcessor (){
+
+ }
+
+ public void initParamMap(String[] attributes, String[] header){
+ this.header = header;
+ this.attributes = attributes;
+ for(int m=0; m<header.length; m++){
+ paramMap.put(header[m], m);
+ }
+ }
+
+
+ // distance between array and array
+ public Float calcDistance(String[] seed, String[] candidate) throws Exception {
+ if (paramMap.isEmpty())
+ throw new Exception("paramMap.isEmpty()");
+
+ Float score = 0f;
+ int p1 = paramMap.get("First Level Category");
+ int p2 = paramMap.get("Second Level Category");
+ if (seed[p1].equals(candidate[p1])) {
+ if (seed[p2].equals(candidate[p2]))
+ score = score+0.0000001f;
+ else
+ score = score+0.01f;
+ } else return 100000f;
+
+ try {
+ int p3 = paramMap.get("Latitude");
+ int p4 = paramMap.get("Longitude");
+ Double latDiff = Math.abs(Double.parseDouble(seed[p3]) - Double.parseDouble(candidate[p3]));
+ Double longDiff = Math.abs(Double.parseDouble(seed[p4]) - Double.parseDouble(candidate[p4]));
+ if (latDiff>1 || longDiff>1)
+ return 1000000f;
+ else
+ score+= latDiff.floatValue()/100.0f + longDiff.floatValue()/100.0f;
+ } catch (Exception e) {
+ return 1000000f;
+ }
+
+
+ return score;
+ }
+
+ // Distance between a set of seed samples (matrix) and one candidate row (array).
+ // The matrix is indexed seed[columnIndex][sampleIndex]; the number of samples is taken from seed[0].length.
+ // The result is the smallest category score over all samples plus the smallest lat/long score over all samples.
+ // Throws Exception when paramMap is empty.
+ public Float calcDistance(String[][] seed, String[] candidate) throws Exception {
+ if (paramMap.isEmpty())
+ throw new Exception("paramMap.isEmpty()");
+
+ Float score = 0f, catScore = 10000f, currCatScore=10000000f;
+
+ int p1 = paramMap.get("First Level Category");
+ int p2 = paramMap.get("Second Level Category");
+ // Pass 1: best (smallest) category score over all seed samples.
+ // currCatScore is not reset between samples; only the running minimum catScore is used afterwards.
+ for(int v=0; v<seed[0].length; v++){
+ if (seed[p1][v].equals(candidate[p1])) {
+ if (seed[p2][v].equals(candidate[p2]))
+ currCatScore = 0.0000001f;
+ else
+ currCatScore = 0.01f;
+ }
+ if ( catScore > currCatScore) // if found closer, update
+ catScore = currCatScore;
+ }
+ score = catScore;
+ // NOTE(review): catScore starts at 10000f and can only decrease, so this branch cannot be taken;
+ // when no sample shares the first-level category the method continues with score == 10000f.
+ // Possibly the threshold was meant to be lower - confirm.
+ if (score > 1000000f)
+ return 10000000f;
+
+ Float latLongScore = 100000f, currLatLongScore = 10000000f;
+ // Pass 2: best (smallest) lat/long score over all seed samples.
+ for(int v=0; v<seed[0].length; v++){
+ try {
+ int p3 = paramMap.get("Latitude");
+ int p4 = paramMap.get("Longitude");
+ // samples (or a candidate) with an empty coordinate are skipped entirely
+ if (seed[p3][v].equals("") || seed[p4][v].equals("")
+ || candidate[p3].equals("") || candidate[p4].equals(""))
+ continue;
+ Double latDiff = Math.abs(Double.parseDouble(seed[p3][v]) - Double.parseDouble(candidate[p3]));
+ Double longDiff = Math.abs(Double.parseDouble(seed[p4][v]) - Double.parseDouble(candidate[p4]));
+ // only samples within 1 unit in both latitude and longitude contribute a score
+ if (!(latDiff>1 || longDiff>1))
+ currLatLongScore = latDiff.floatValue()/100.0f + longDiff.floatValue()/100.0f;
+ } catch (Exception e) {
+ // lookup/parse failures for one sample are ignored on purpose; the sample just contributes nothing
+ //return 1000000f;
+ }
+ if (latLongScore > currLatLongScore)
+ latLongScore = currLatLongScore;
+
+ }
+ // No sample was close enough (latLongScore still at its initial 100000f): return 10000f,
+ // discarding the category score computed above.
+ if (latLongScore> 10000)
+ return 10000f;
+ score+=latLongScore;
+ return score;
+ }
+
+ /**
+  * Looks up the zero-based column index registered for an attribute name.
+  *
+  * @param key attribute (column) name as it appears in the header row
+  * @return the column index, or null when the name is not present in paramMap; callers
+  *         that unbox the result into an int will fail on an unknown name
+  */
+ public Integer getIdForAttributeName(String key){
+   Integer res = paramMap.get(key);
+   if (res == null) {
+     // explicit check instead of provoking and catching a NullPointerException
+     System.out.println("wrong key"+key);
+   }
+   return res;
+ }
+
+ // Returns the header (column) name stored at the given index of the header row.
+ // No bounds or null check is done: a null id, a null header or an out-of-range id throws.
+ public String getAttribNameForId(Integer id){
+ return header[id];
+ }
+
+
+
+
+ // Computes the "intersection" of two data rows over the configured attributes.
+ // For every attribute both values are lower-cased and stripped of double quotes. If either value has no
+ // {...} part it is treated as a single value and kept only when both values are equal. Otherwise the
+ // comma-separated members inside the braces are intersected and, when non-empty, stored as "{a, b}".
+ // Returns a map attribute name -> common value; attributes without a common value are absent.
+ public Map<String, String> computeIntersection(String[] line1,
+ String[] line2) {
+
+ Map<String, String> attr_value = new HashMap<String, String>();
+ for(String attr: attributes){
+ int attrIndex = getIdForAttributeName(attr);
+ // NOTE(review): as written, replace(", ", ", ") is a no-op (search and replacement are identical);
+ // possibly the original literals differed in whitespace - confirm against the repository.
+ String v1 = line1[attrIndex].toLowerCase().replace("\"", "").replace(", ", ", ").replace(", ", ",");;
+ String v2 = line2[attrIndex].toLowerCase().replace("\"", "").replace(", ", ", ").replace(", ", ",");;
+ // substringBetween yields null when the value has no {...} part
+ String valArr1Str = StringUtils.substringBetween(v1, "{", "}");
+ String valArr2Str = StringUtils.substringBetween(v2, "{", "}");
+ if (valArr1Str==null || valArr2Str==null) { // we assume single value, not an array of values
+ if (v1.equals(v2)){
+ attr_value.put(attr, v1);
+ }
+ }
+ else {
+ valArr1Str = valArr1Str.replaceAll(", ", ",");
+ valArr2Str = valArr2Str.replaceAll(", ", ",");
+ String[] valArr1 = valArr1Str.split(",");
+ String[] valArr2 = valArr2Str.split(",");
+ List<String> valList1 = new ArrayList<String>(Arrays.asList(valArr1));
+ List<String> valList2 = new ArrayList<String>(Arrays.asList(valArr2));
+ // keep only the members present in both rows
+ valList1.retainAll(valList2);
+ /* verification of coverage
+ valList1.retainAll(valList2);
+
+ List<String> vl1 = new ArrayList<String>(Arrays.asList(valArr1));
+ valList1.retainAll(vl1); */
+
+ if (!valList1.isEmpty()){
+ // List.toString() gives "[a, b]"; the brackets are blanked out and trimmed, then braces are added.
+ // Any '[' or ']' inside a member value is blanked out as well.
+ v1 = "{"+valList1.toString().replace("["," ").replace("]", " ").trim()+"}";
+ attr_value.put(attr, v1);
+ }
+
+ }
+ }
+ return attr_value;
+ }
+
+
+ /**
+  * Tells whether a rule covers a data row (case).
+  * Attributes the rule does not mention impose no constraint. For a single-valued attribute
+  * the normalised (lower-cased, quote-free) case value must equal the rule value; for a
+  * multi-valued attribute written as {a,b,...} every member of the rule must occur in the case.
+  *
+  * @param attr_value the rule: attribute name -> required value
+  * @param line the data row, indexed by header position
+  * @return true when every constraint of the rule is satisfied by the row
+  */
+ public boolean ruleCoversCase(Map<String, String> attr_value, String[] line){
+   for(String attr: attributes){
+     String rule = attr_value.get(attr);
+     if (rule == null)
+       continue; // no constraint
+
+     int attrIndex = getIdForAttributeName(attr);
+     // rule for this attribute exists but case has no value; this has to be checked
+     // BEFORE the value is dereferenced, and a short row counts as "no value" too
+     if (attrIndex >= line.length || line[attrIndex] == null)
+       return false;
+
+     rule = rule.toLowerCase().replace("\"", "").replace(", ", ",").replace(", ", ",");
+     String vCase = line[attrIndex].toLowerCase().replace("\"", "").replace(", ", ",").replace(", ", ",");
+
+     String valArrCaseStr = StringUtils.substringBetween(vCase, "{", "}");
+     String valArrRuleStr = StringUtils.substringBetween(rule, "{", "}");
+     if (valArrCaseStr==null || valArrRuleStr==null) { // we assume single value, not an array of values
+       if (!vCase.equals(rule))
+         return false;
+     }
+     else {
+       List<String> valListCase = Arrays.asList(valArrCaseStr.split(","));
+       List<String> valListRule = Arrays.asList(valArrRuleStr.split(","));
+       // rule members must be a subset of the case members
+       if (!valListCase.containsAll(valListRule))
+         return false;
+     }
+   }
+   return true;
+ }
+
+ public boolean ruleCoversRule(Map<String, String> attr_value, Map<String, String> line){
+ boolean soFarCovers = true;
+ for(String attr: attributes){
+ int attrIndex = getIdForAttributeName(attr);
+ String rule = attr_value.get(attr);
+ if (rule == null)
+ continue; // no constraint
+
+ String vRuleBeingCovered = line.get(attr);
+ if (vRuleBeingCovered==null){// rule for this attribute exists but RuleBeingCovered has no value
+ soFarCovers = false;
+ return false;
+ }
+
+ String valArrRuleBeingCoveredStr = StringUtils.substringBetween(vRuleBeingCovered, "{", "}");
+ String valArrRuleStr = StringUtils.substringBetween(rule, "{", "}");
+ if (valArrRuleBeingCoveredStr==null || valArrRuleStr==null) { // we assume single value, not an array of values
+ if (!vRuleBeingCovered.equals(rule)){
+ soFarCovers = false;
+ return false;
+ }
+ }
+ else {
+ String[] valArrRuleBeingCovered = valArrRuleBeingCoveredStr.split(",");
+ String[] valArrRule = valArrRuleStr.split(",");
+ List<String> valListRuleBeingCovered = new ArrayList<String>(Arrays.asList(valArrRuleBeingCovered));
+ List<String> valListRule = new ArrayList<String>(Arrays.asList(valArrRule));
+ for(String r: valListRule){
+ if (!strListContainsMember(valListRuleBeingCovered, r)){
+ soFarCovers = false;
+ return false;
+ }
+ }
+
+ }
+ }
+ return soFarCovers;
+ }
+
+ /**
+  * Computes the intersection of two rules over the configured attributes.
+  * An attribute is kept only when both rules constrain it: single values must be equal,
+  * multi-valued attributes written as {a, b, ...} keep the members common to both rules.
+  * Only the two maps are read, so the header index is not consulted here; the former
+  * unused index lookup failed for attributes missing from the header.
+  *
+  * @param rule1 first rule: attribute name -> value
+  * @param rule2 second rule: attribute name -> value
+  * @return a new map holding the common constraints; empty when there are none
+  */
+ public Map<String, String> computeIntersection(
+     Map<String, String> rule1, Map<String, String> rule2) {
+   Map<String, String> attr_value = new HashMap<String, String>();
+   for(String attr: attributes){
+     String v1 = rule1.get(attr);
+     String v2 = rule2.get(attr);
+     if (v1==null || v2==null)
+       continue;
+     String valArr1Str = StringUtils.substringBetween(v1, "{", "}");
+     String valArr2Str = StringUtils.substringBetween(v2, "{", "}");
+     if (valArr1Str==null || valArr2Str==null) { // we assume single value, not an array of values
+       if (v1.equals(v2)){
+         attr_value.put(attr, v1);
+       }
+       continue;
+     }
+     valArr1Str = valArr1Str.replaceAll(", ", ",");
+     valArr2Str = valArr2Str.replaceAll(", ", ",");
+     List<String> valList1 = new ArrayList<String>(Arrays.asList(valArr1Str.split(",")));
+     List<String> valList2 = Arrays.asList(valArr2Str.split(","));
+     // keep only the members present in both rules
+     valList1.retainAll(valList2);
+     if (!valList1.isEmpty()){
+       // List.toString() gives "[a, b]"; turn it into "{a, b}"
+       v1 = "{"+valList1.toString().replace("["," ").replace("]", " ").trim()+"}";
+       attr_value.put(attr, v1);
+     }
+   }
+   return attr_value;
+ }
+
+ /**
+  * Checks whether some member of the list is prefix-related to the given string, i.e. the
+  * member starts with the string or the string starts with the member.
+  *
+  * @param valListCase list of candidate members
+  * @param r the string to look for
+  * @return true as soon as one prefix-related member is found, false otherwise
+  */
+ private boolean strListContainsMember(List<String> valListCase, String r) {
+   for (int idx = 0; idx < valListCase.size(); idx++) {
+     String member = valListCase.get(idx);
+     boolean prefixRelated = member.startsWith(r) || r.startsWith(member);
+     if (prefixRelated) {
+       return true;
+     }
+   }
+   return false;
+ }
+}
\ No newline at end of file
Added: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/jsmlearning/IntersectionSetBuilder.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/jsmlearning/IntersectionSetBuilder.java?rev=1555944&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/jsmlearning/IntersectionSetBuilder.java (added)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/jsmlearning/IntersectionSetBuilder.java Mon Jan 6 17:48:30 2014
@@ -0,0 +1,361 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.jsmlearning;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+/*
+ *
+ * The rule is in the form
+The report also shows how many positive cases are covered by this rule (should be 0) and how many negative cases
+are covered by this rule (should be above 1)
+
+The rule
+{plugin_number=3, service_type=all, mime_type_number=11, review_status=pass} 0 192
+
+should be read as
+
+plugin_number=3 & service_type=all & mime_type_number=11 & review_status=pass
+
+For a single-attribute, its value should be the one from this rule. For a multi-value attribute, the set of values in the case
+should INCLUDE the set of values from the rule.
+
+The rule checking that a case belongs to the negative set is a disjunction of all rules in the result file.
+
+input: two data files, one is negative set and another is positive set.
+in the argument, just the negative file needs to be specified:
+".../negativeSet1.csv",
+then the system assumes that the filename for the positive set is obtained by replacing 'negative' with 'positive'
+".../positiveSet1.csv",
+
+The set of attribute in analysis is hard coded
+
+
+ */
+public class IntersectionSetBuilder{
+ // Coverage processors bound to the header of the positive and of the negative data file;
+ // both are (re)created by ruleVerifier and ruleFormer.
+ private FeatureSpaceCoverageProcessor distProcessorPos, distProcessorNeg;
+ // A candidate rule is accepted only while the fraction of opposite-set cases it covers stays below this value.
+ private float percentageOfAllowedSetCover = 0.001f;
+ //The set of attribute in analysis is hard coded
+ // NOTE(review): "http_connection_type" appears twice in this list; possibly the second entry was
+ // meant to be a different attribute - confirm.
+ String[] fieldsToAggr = new String[]{
+ "reason_code", "risk_rating", "service_type", "device_match_result", "device_result", "http_referer", "device_id_reason_code",
+ "review_status", "tcp_os_sig_ttl", "tcp_connection_type",
+ "mime_type_number", "plugin_number", "http_connection_type", "device_last_event", "http_connection_type"
+
+
+ };
+ // No-argument constructor; nothing is initialised here.
+ public IntersectionSetBuilder() {};
+
+ /*
+ * Takes a file generated by public String ruleFormer(String dataFile)
+ * and performs verification of coverage for positive and negative set, as well as dedupe of rules
+ * The input for negative positive data set is the same as the above function.
+ * The second argument is the rule file generated by the above.
+ * Outputs the verified rule file.
+ */
+
+ public void ruleVerifier(String dataFile, String ruleFile){
+
+
+ List<String[]> negativeSet = ProfileReaderWriter.readProfiles(dataFile);
+ List<String[]> positiveSet = ProfileReaderWriter.readProfiles(dataFile.replace("negative", "positive"));
+ distProcessorPos = new FeatureSpaceCoverageProcessor(); distProcessorNeg = new FeatureSpaceCoverageProcessor();
+ distProcessorNeg.initParamMap( fieldsToAggr, negativeSet.get(0));
+ distProcessorPos.initParamMap(fieldsToAggr, positiveSet.get(0));
+ negativeSet.remove(0); positiveSet.remove(0);
+
+ List<String[]> ruleStrings = ProfileReaderWriter.readProfiles(ruleFile);
+ List<Map<String, String>> rules = new ArrayList<Map<String, String>>(), dedupedRules = new ArrayList<Map<String, String>>() ;
+ for(String[] l : ruleStrings){
+ Map<String, String> rule = new HashMap<String, String>();
+ String lstr = l[0].substring(1, l[0].length()-1);
+ String[] ruleStr= lstr.split(",");
+ for(String attr_valueStr: ruleStr){
+ String[] attr_value = attr_valueStr.split("=");
+ if (attr_value.length==2)
+ rule.put(attr_value[0].trim(), attr_value[1].trim());
+ else if (attr_value.length==1)
+ rule.put(attr_value[0].trim(),"");
+ else
+ System.err.println("Problem parsing rule file "+lstr);
+ }
+ rules.add(rule);
+ }
+
+
+ for(int i=0; i<rules.size(); i++){
+ boolean bCovered = false;
+
+ for(int j=i+1; j<rules.size(); j++){
+ if (distProcessorNeg.ruleCoversRule(rules.get(j), rules.get(i))){
+ bCovered = true;
+ }
+ }
+ if (!bCovered)
+ dedupedRules.add(rules.get(i));
+ }
+
+ rules = dedupedRules;
+
+ List<String[]> output = new ArrayList<String[]>();
+ output.add(new String[]{"rule", "# covers positive", "# covers negative"});
+ for(Map<String, String> rule: rules){
+ int countCoverNeg = 0, countCoverPos=0;
+ for(String[] line: positiveSet){
+ if (distProcessorPos.ruleCoversCase(rule, line)){
+ countCoverPos++;
+ }
+ }
+ for(String[] line: negativeSet){
+ if (distProcessorNeg.ruleCoversCase(rule, line)){
+ countCoverNeg++;
+ }
+
+ }
+ output.add(new String[]{rule.toString(), new Integer(countCoverPos).toString(), new Integer(countCoverNeg).toString()});
+
+ }
+ ProfileReaderWriter.writeReport(output, ruleFile+"Verif1.csv");
+ }
+
+
+ /*
+  * Learns rules from the negative training set file given as the only argument; the positive
+  * file name is assumed to be the same name with 'negative' replaced by 'positive'.
+  * Writes every rule together with the number of positive cases it covers and
+  * returns the name of the file the rules were written to.
+  */
+ public String ruleFormer(String dataFile){
+   List<String[]> negativeSet = ProfileReaderWriter.readProfiles(dataFile);
+   List<String[]> positiveSet = ProfileReaderWriter.readProfiles(dataFile.replace("negative", "positive"));
+
+   distProcessorPos = new FeatureSpaceCoverageProcessor();
+   distProcessorNeg = new FeatureSpaceCoverageProcessor();
+   // the first row of each file is its header
+   distProcessorNeg.initParamMap(fieldsToAggr, negativeSet.get(0));
+   distProcessorPos.initParamMap(fieldsToAggr, positiveSet.get(0));
+   negativeSet.remove(0);
+   positiveSet.remove(0);
+
+   List<Map<String, String>> pairwiseRules =
+       formIntersectionAmongMembersOfTrainingSetAndVerifyThatDoesNotCoverOppositeTrainingS(negativeSet, positiveSet);
+   List<Map<String, String>> mergedRules = formIntersections(pairwiseRules, negativeSet, positiveSet);
+
+   List<String[]> report = new ArrayList<String[]>();
+   for (int r = 0; r < mergedRules.size(); r++) {
+     Map<String, String> rule = mergedRules.get(r);
+     int positivesCovered = 0;
+     for (int k = 0; k < positiveSet.size(); k++) {
+       if (distProcessorPos.ruleCoversCase(rule, positiveSet.get(k))) {
+         positivesCovered++;
+       }
+     }
+     report.add(new String[]{rule.toString(), Integer.toString(positivesCovered)});
+   }
+
+   String outputFile = "learnedRulesForNegativeSetJune23-1.csv";
+   ProfileReaderWriter.writeReport(report, outputFile);
+   return outputFile;
+ }
+
+ // Second-level generalisation: intersects every pair of the given rules and returns the accepted
+ // new intersections followed by ALL input rules.
+ // The negativeSet parameter is not used in this method.
+ private List<Map<String, String>> formIntersections(List<Map<String, String>> intersectionsIn, List<String[]> negativeSet, List<String[]> positiveSet) {
+ List<Map<String, String>> intersectionsNew = new ArrayList<Map<String, String>>();
+ for(int i=0; i<intersectionsIn.size(); i++){
+ for(int j=i+1; j<intersectionsIn.size(); j++){
+ Map<String, String> intersection = distProcessorNeg.computeIntersection(intersectionsIn.get(i), intersectionsIn.get(j));
+ if (intersection.isEmpty())
+ continue;
+
+ // countCover is not a real count: the first positive case that is covered sets it to 10000000
+ // and stops the scan, so cover below is either 0 or a very large value.
+ int countCover = 0;
+ for(String[] line: positiveSet){
+ if (distProcessorPos.ruleCoversCase(intersection, line)){
+ //countCover++;
+ countCover = 10000000;
+ break;
+ }
+ }
+ // NOTE(review): with an empty positiveSet this is 0f/0f = NaN, the comparison below is false and
+ // every candidate is skipped - confirm that this is intended.
+ float cover = (float)countCover/(float)positiveSet.size();
+ if (!(cover<this.percentageOfAllowedSetCover))
+ continue;
+
+ List<Map<String, String>> rulesToBeRemoved = new ArrayList<Map<String, String>>();
+ boolean nothingCoversThisRule = true;
+ for(Map<String, String> intersChecker: intersectionsIn){ // more general rule covers more specific
+ if (distProcessorNeg.ruleCoversRule(intersChecker, intersection)){
+ nothingCoversThisRule = false;
+ break;
+ } // now check if this new rule defeats built rules
+ if (distProcessorNeg.ruleCoversRule( intersection, intersChecker)){
+ rulesToBeRemoved.add(intersChecker);
+ }
+ }
+ if(nothingCoversThisRule){
+ intersectionsNew.add(intersection);
+ // NOTE(review): rulesToBeRemoved holds entries of intersectionsIn; they are removed from
+ // intersectionsNew here, but the whole of intersectionsIn is appended again after the loops,
+ // so those input rules are still part of the result.
+ intersectionsNew.removeAll(rulesToBeRemoved);
+ }
+ }
+ }
+ intersectionsNew.addAll(intersectionsIn);
+ return intersectionsNew;
+ }
+
+ // First-level generalisation: intersects pairs of negative cases and keeps an intersection as a rule when
+ // it covers no positive case and is not covered by a rule collected so far; previously collected rules
+ // that the new rule covers are removed from the result.
+ private List<Map<String, String>> formIntersectionAmongMembersOfTrainingSetAndVerifyThatDoesNotCoverOppositeTrainingS(List<String[]> negativeSet, List<String[]> positiveSet){
+ List<Map<String, String>> intersections = new ArrayList<Map<String, String>>();
+
+ // only the first 1000 negative cases are used as the first member of a pair
+ for(int i=0; i<negativeSet.size() && i<1000; i++){
+ for(int j=i+1; j<negativeSet.size(); j++){
+ Map<String, String> intersection = distProcessorNeg.computeIntersection(negativeSet.get(i), negativeSet.get(j));
+ if (intersection.isEmpty())
+ continue;
+
+ /* temporary code that formed rule covers at least 2 cases
+ int countCoverNeg=0;
+ for(String[] line: negativeSet){
+ if (distProcessorNeg.ruleCoversCase(intersection, line)){
+ countCoverNeg++;
+ }
+
+ }
+ if (countCoverNeg<2){
+ System.err.println("A rule formed but it does not cover its origin! "+intersection);
+ distProcessorNeg.ruleCoversCase(intersection, negativeSet.get(i));
+ distProcessorNeg.ruleCoversCase(intersection, negativeSet.get(j));
+ } */
+
+
+
+ // countCover is not a real count: the first positive case that is covered sets it to 10000000
+ // and stops the scan, so cover below is either 0 or a very large value.
+ int countCover = 0;
+ for(String[] line: positiveSet){
+ if (distProcessorPos.ruleCoversCase(intersection, line)){
+ //countCover++;
+ countCover = 10000000;
+ break;
+ }
+ }
+ // NOTE(review): with an empty positiveSet this is 0f/0f = NaN, the comparison below is false and
+ // every candidate is skipped - confirm that this is intended.
+ float cover = (float)countCover/(float)positiveSet.size();
+ if (!(cover<this.percentageOfAllowedSetCover))
+ continue;
+
+ List<Map<String, String>> rulesToBeRemoved = new ArrayList<Map<String, String>>();
+ boolean nothingCoversThisRule = true;
+ for(Map<String, String> intersChecker: intersections){ // more general rule covers more specific
+ if (distProcessorNeg.ruleCoversRule(intersChecker, intersection)){
+ nothingCoversThisRule = false;
+ break;
+ } // now check if this new rule defeats built rules
+ if (distProcessorNeg.ruleCoversRule( intersection, intersChecker)){
+ rulesToBeRemoved.add(intersChecker);
+ }
+ }
+ // removals collected before a 'break' above are discarded, because they are applied only
+ // when no existing rule covers the new one
+ if(nothingCoversThisRule){
+ intersections.add(intersection);
+ intersections.removeAll(rulesToBeRemoved);
+ }
+ }
+ }
+ return intersections;
+ }
+
+ /**
+  * Keeps only the rules that cover less than the allowed fraction
+  * (percentageOfAllowedSetCover) of the opposite, i.e. positive, training set.
+  * The previous version counted the positive cases a rule did NOT cover, which kept exactly
+  * the rules that cover almost the whole positive set.
+  *
+  * @param intersections candidate rules
+  * @param positiveSet rows of the positive set, without the header row
+  * @return the rules whose positive-set coverage is below the allowed fraction
+  */
+ private List<Map<String, String>> filterIntersectionsByOppositeTrainingSet(List<Map<String, String>> intersections, List<String[]> positiveSet){
+   List<Map<String, String>> filteredIntersections = new ArrayList<Map<String, String>>();
+   for(Map<String, String> rule: intersections){
+     int countCover = 0;
+     for(String[] line: positiveSet){
+       if (distProcessorPos.ruleCoversCase(rule, line))
+         countCover++;
+     }
+     // an empty positive set cannot be covered at all; this also avoids 0/0 = NaN
+     float cover = positiveSet.isEmpty() ? 0f : (float)countCover/(float)positiveSet.size();
+     if (cover < this.percentageOfAllowedSetCover)
+       filteredIntersections.add(rule);
+   }
+   return filteredIntersections;
+ }
+
+ // Placeholder: always returns true and ignores the sample; no rule is read or applied yet.
+ public boolean applyRule(String[] sample){
+ return true;
+ // todo implement singleton which reads rule file and applies them
+
+ }
+
+ /**
+  * Builds the rules for a negative training set and then verifies them.
+  *
+  * @param args optional; args[0] is the path of the negative-set CSV file. Without arguments
+  *             the original hard-coded development path is used.
+  */
+ public static void main(String[] args){
+   String negativeSetFile = (args != null && args.length > 0)
+       ? args[0]
+       : "C:/workspace/relevanceEngine/src/test/resources/maps/anomaly/negativeSet1.csv";
+   IntersectionSetBuilder iBuilder = new IntersectionSetBuilder ();
+
+   // builds the set of rules
+   String resFile = iBuilder.ruleFormer(negativeSetFile);
+   // verifies and cleans the rules, reading the file ruleFormer has just written instead of
+   // a second hard-coded path that only matched for one particular working directory
+   iBuilder.ruleVerifier(negativeSetFile, resFile);
+ }
+
+}
+
+/*
+ *
+ * datetime
+browser_language
+browser_string
+device_first_seen
+device_match_result
+http_os_signature
+http_os_sig_raw
+os
+device_id_reason_code
+true_ip
+proxy_ip
+http_os_sig_adv_mss
+http_os_sig_snd_mss
+http_os_sig_rcv_mss
+http_os_sig_ttl
+http_connection_type
+device_last_event
+flash_lang
+flash_os
+flash_version
+os_fonts_number
+plugin_adobe_acrobat
+plugin_flash
+plugin_silverlight
+plugin_windows_media_player
+profiling_datetime
+screen_res
+tcp_os_signature
+tcp_os_sig_raw
+time_zone
+time_zone_dst_offset
+profile_api_timedelta
+mime_type_number
+plugin_number
+plugin_quicktime
+plugin_java
+fuzzy_device_id_confidence
+fuzzy_device_match_result
+fuzzy_device_last_event
+fuzzy_device_first_seen
+true_ip_city
+true_ip_first_seen
+true_ip_geo
+true_ip_latitude
+true_ip_longitude
+account_email_first_seen
+shipping_address_first_seen
+tcp_os_sig_ttl
+tcp_connection_type
+page_time_on
+policy_score
+reason_code
+review_status
+risk_rating
+ */
Added: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/jsmlearning/ProfileReaderWriter.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/jsmlearning/ProfileReaderWriter.java?rev=1555944&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/jsmlearning/ProfileReaderWriter.java (added)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/jsmlearning/ProfileReaderWriter.java Mon Jan 6 17:48:30 2014
@@ -0,0 +1,140 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.jsmlearning;
+
+import java.io.FileNotFoundException;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.util.ArrayList;
+import java.util.List;
+
+import au.com.bytecode.opencsv.CSVReader;
+import au.com.bytecode.opencsv.CSVWriter;
+
+/**
+ * Static helpers for reading and writing CSV "profile" files through opencsv.
+ * I/O problems are reported with printStackTrace() and never propagated to the caller.
+ */
+public class ProfileReaderWriter {
+
+  /**
+   * Reads a comma-separated file.
+   *
+   * @param filename path of the file to read
+   * @return all rows of the file, or null when the file cannot be opened or read
+   */
+  public static List<String[]> readProfiles(String filename) {
+    return readProfiles(filename, ',');
+  }
+
+  /**
+   * Reads a delimited file.
+   *
+   * @param filename path of the file to read
+   * @param delimiter field separator
+   * @return all rows of the file, or null when the file cannot be opened or read
+   */
+  public static List<String[]> readProfiles(String filename, char delimiter) {
+    CSVReader reader = null;
+    List<String[]> profiles = null;
+    try {
+      reader = new CSVReader(new FileReader(filename), delimiter);
+      profiles = reader.readAll();
+    } catch (FileNotFoundException e) {
+      e.printStackTrace();
+    } catch (IOException ioe) {
+      ioe.printStackTrace();
+    } finally {
+      // release the file handle on every path
+      if (reader != null) {
+        try {
+          reader.close();
+        } catch (IOException closeError) {
+          closeError.printStackTrace();
+        }
+      }
+    }
+    return profiles;
+  }
+
+  /**
+   * Writes a two-dimensional array as a report with the default CSV settings.
+   *
+   * @param allLines rows to write
+   * @param reportName path of the file to write
+   */
+  public static void writeReportArr( String[][] allLines, String reportName){
+    List<String[]> rep = new ArrayList<String[]>();
+    for(String[] line: allLines){
+      rep.add(line);
+    }
+    writeReport( rep, reportName);
+  }
+
+  /**
+   * Writes the rows to a file with the default CSV settings, replacing existing content.
+   * Nothing is written when the file cannot be opened.
+   *
+   * @param allLines rows to write
+   * @param reportName path of the file to write
+   */
+  public static void writeReport( List<String[]> allLines, String reportName){
+    CSVWriter writer;
+    try {
+      writer = new CSVWriter(new PrintWriter(reportName));
+    } catch (FileNotFoundException e) {
+      e.printStackTrace();
+      return; // no writer: nothing can be written
+    }
+    writeAndClose(writer, allLines);
+  }
+
+  /**
+   * Writes the rows to a file, replacing existing content.
+   * Nothing is written when the file cannot be opened.
+   * NOTE(review): the delimiter is passed as the separator and also as the second and third
+   * character argument of the CSVWriter constructor, as the original code did - confirm
+   * against the opencsv API that this is intended.
+   *
+   * @param allLines rows to write
+   * @param reportName path of the file to write
+   * @param delimiter character passed to the CSVWriter as described above
+   */
+  public static void writeReport( List<String[]> allLines, String reportName, char delimiter){
+    CSVWriter writer;
+    try {
+      writer = new CSVWriter(new PrintWriter(reportName), delimiter, delimiter, delimiter);
+    } catch (FileNotFoundException e) {
+      e.printStackTrace();
+      return; // no writer: nothing can be written
+    }
+    writeAndClose(writer, allLines);
+  }
+
+  /**
+   * Adds the rows already stored in the report file to allLines (the caller's list is
+   * modified) and rewrites the file with the combined rows.
+   * NOTE(review): the existing rows end up AFTER the new ones, and the existing file is read
+   * with the default comma separator rather than with the given delimiter - confirm.
+   *
+   * @param allLines new rows; receives the previously stored rows as well
+   * @param reportName path of the report file
+   * @param delimiter character used when writing, see writeReport
+   */
+  public static void appendReport( List<String[]> allLines, String reportName, char delimiter){
+    List<String[]> previous = readProfiles(reportName);
+    if (previous != null) {
+      allLines.addAll(previous);
+    } else {
+      // explicit null check instead of catching the NullPointerException from addAll(null)
+      System.out.println("Creating file "+reportName);
+    }
+    writeReport(allLines, reportName, delimiter);
+  }
+
+  /**
+   * Not implemented: does nothing.
+   */
+  public static void writeReportListStr(List<String> res, String string) {
+    // TODO Auto-generated method stub
+
+  }
+
+  /** Writes all rows, flushes, and always closes the writer. */
+  private static void writeAndClose(CSVWriter writer, List<String[]> allLines) {
+    try {
+      writer.writeAll(allLines);
+      writer.flush();
+    } catch (IOException e) {
+      e.printStackTrace();
+    } finally {
+      try {
+        writer.close();
+      } catch (IOException closeError) {
+        closeError.printStackTrace();
+      }
+    }
+  }
+
+  public static void main(String[] args){
+    List<String[]> allLines = new ArrayList<String[]>();
+    allLines.add(new String[] {"aa " , " bb", "ccc" });
+    ProfileReaderWriter.writeReport( allLines, "reportName.txt", ' ');
+
+  }
+
+
+}
\ No newline at end of file
Added: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/jsmlearning/TreeKernelRunner.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/jsmlearning/TreeKernelRunner.java?rev=1555944&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/jsmlearning/TreeKernelRunner.java (added)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/jsmlearning/TreeKernelRunner.java Mon Jan 6 17:48:30 2014
@@ -0,0 +1,131 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.jsmlearning;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+
+public class TreeKernelRunner {
+ private void runEXE(String[] command, String runPath){
+ Runtime r = Runtime.getRuntime();
+ Process mStartProcess = null;
+ try {
+ mStartProcess = r.exec( command, null, new File(runPath));
+ } catch (IOException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+
+ StreamLogger outputGobbler = new StreamLogger(mStartProcess.getInputStream());
+ outputGobbler.start();
+
+ try {
+ int returnCode = mStartProcess.waitFor();
+ } catch (InterruptedException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ }
+
+ public void runLearner(String dir, String learning_file, String model_file)
+ {
+ dir = dir.replace('/', '\\');
+
+ if (!dir.endsWith("\\"))
+ dir+="\\";
+ String[] runString = new String[]{dir+"svm_learn.exe","-t", "5", dir+learning_file, dir+model_file};
+ runEXE(runString, dir);
+ }
+
+
+ //svm_classify example_file model_file predictions_file
+ public void runClassifier(String dir, String example_file, String model_file, String predictions_file)
+ {
+ dir = dir.replace('/', '\\');
+
+ if (!dir.endsWith("\\"))
+ dir+="\\";
+ String[] runString = new String[]{dir+"svm_classify.exe", dir+example_file, dir+model_file, dir+predictions_file};
+ runEXE(runString, dir);
+ }
+
+ class StreamLogger extends Thread{
+
+ private InputStream mInputStream;
+
+ public StreamLogger(InputStream is) {
+ this.mInputStream = is;
+ }
+
+ public void run() {
+ try {
+ InputStreamReader isr = new InputStreamReader(mInputStream);
+ BufferedReader br = new BufferedReader(isr);
+ String line = null;
+ while ((line = br.readLine()) != null) {
+ System.out.println(line);
+ }
+ } catch (IOException ioe) {
+ ioe.printStackTrace();
+ }
+ }
+
+ }
+
+ public static void main(String[] args){
+ TreeKernelRunner runner = new TreeKernelRunner();
+ runner.runLearner("C:\\stanford-corenlp\\tree_kernel\\", "training.txt", "arg0.model1.txt");
+ runner.runClassifier("C:\\stanford-corenlp\\tree_kernel\\", "arg0.test", "arg0.model1.txt", "arg0.output1.txt");
+ }
+}
+
+ /*
+exec:
+
+public Process exec(String command, String envp[], File dir)
+
+
+
+ @param command a specified system command.
+ @param envp array of strings, each element of which
+ has environment variable settings in format
+ <i>name</i>=<i>value</i>.
+ @param dir the working directory of the subprocess, or
+ <tt>null</tt> if the subprocess should inherit
+ the working directory of the current process.
+
+ The distribution contains two exe files: svm_learn.exe and svm_classify.exe.
+
+1. svm_learn.exe takes a file with examples, processes it, and builds a model file with the learned rules.
+
+Examples of invocation:
+svm_learn -t 5 learning_file model_file - this is the simplest way to run it, SubSetTreeKernel (gaps are allowed when traversing the trees)
+
+svm_learn -t 5 -D 0 learning_file model_file - another kernel variant, SubTreeKernel
+
+An example file can be found on its web page. The parameters are described there as well.
+
+2. svm_classify.exe takes a file with test examples and the model file built by svm_learn, and writes the results to the file predictions_file.
+
+Invocation: svm_classify example_file model_file predictions_file
+
+The file has the same format as the input examples. A sample is in the archive on Moschitti's page.
+You can also specify right away which class an example belongs to (1 or -1 at the start of the line). In that case precision and recall are estimated automatically. Otherwise put 0 there.
+ */
\ No newline at end of file
Modified: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/nl2code/NL2Obj.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/nl2code/NL2Obj.java?rev=1555944&r1=1555943&r2=1555944&view=diff
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/nl2code/NL2Obj.java (original)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/nl2code/NL2Obj.java Mon Jan 6 17:48:30 2014
@@ -16,6 +16,7 @@
*/
package opennlp.tools.nl2code;
+import java.io.File;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
@@ -28,11 +29,19 @@ import opennlp.tools.textsimilarity.chun
public class NL2Obj {
ObjectControlOp prevOp;
- public NL2Obj() {
+ public NL2Obj(String path) {
prevOp = new ObjectControlOp();
prevOp.setOperatorIf("");
prevOp.setOperatorFor("");
+ parser = ParserChunker2MatcherProcessor.getInstance(path);
}
+
+ public NL2Obj() {
+ prevOp = new ObjectControlOp();
+ prevOp.setOperatorIf("");
+ prevOp.setOperatorFor("");
+ parser = ParserChunker2MatcherProcessor.getInstance();
+ }
public static String[] epistemicStatesList = new String[] {
"select", "verify", "find", "start", "stop", "go", "check"
@@ -268,6 +277,9 @@ public class NL2Obj {
public static void main(String[] args){
+
+ String cDir = new File(".").getAbsolutePath();
+
String[] text = new String[]{
"Randomly select a pixel at an image.",
"Find a convex area this pixel belongs, so that all pixels are less than 128", //area->REGION
Modified: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/nl2code/NL2ObjCreateAssign.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/nl2code/NL2ObjCreateAssign.java?rev=1555944&r1=1555943&r2=1555944&view=diff
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/nl2code/NL2ObjCreateAssign.java (original)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/nl2code/NL2ObjCreateAssign.java Mon Jan 6 17:48:30 2014
@@ -25,120 +25,129 @@ import opennlp.tools.textsimilarity.chun
public class NL2ObjCreateAssign extends NL2Obj {
- private boolean classBeingDefined = false;
- public static String[] declarationStatesList = new String[] {
- "create", "assign", "set",
- };
-
- public static String[] dataTypesList = new String[] {
- "text", "double", "array",
- };
-
- public static String[] arrayElementList = new String[] {
- "first", "second", "third", "fourth"
- };
-
- public static String[] arrayElementListInsdex = new String[] {
- "0", "1", "2", "3"
- };
-
-
- @Override
- public ObjectPhraseListForSentence convertSentenceToControlObjectPhrase(String sentence){
- String expression = null;
- if (sentence.indexOf(":")>-1){
- expression = sentence.split(":")[1];
- sentence = sentence.split(":")[0]+".";
- }
-
-
- List<ObjectPhrase> oPhrases = new ArrayList<ObjectPhrase>();
- parser = ParserChunker2MatcherProcessor.getInstance();
- List<List<ParseTreeChunk>> lingPhrases =
- parser.formGroupedPhrasesFromChunksForSentence(sentence);
-
- ObjectControlOp op = extractControlPart(lingPhrases, prevOp);
- prevOp = op;
-
- //start with verb phrases
- List<ParseTreeChunk> actionWithObject = lingPhrases.get(1);
- actionWithObject.addAll( lingPhrases.get(4));
-
- System.out.println(" === "+actionWithObject);
-
- for(ParseTreeChunk verbChunk: actionWithObject){
- List<String> lems = verbChunk.getLemmas();
- String declarativeAction = verbChunk.getLemmas().get(0).toLowerCase();
- if (declarativeAction.equals("define")){
- if (verbChunk.getLemmas().get(1).toLowerCase().equals("class") ||
- verbChunk.getLemmas().get(2).toLowerCase().equals("class")){
- // new class
- String className = verbChunk.getLemmas().get(verbChunk.getLemmas().size()-1).toLowerCase();
- className = className.substring(0, 1).toUpperCase()+className.substring(1, className.length());
- op.setOperatorIf("class "+className + "{");
- op.setOperatorFor("{");
- classBeingDefined = true;
- break;
- }
- String dataType = verbChunk.getLemmas().get(1).toLowerCase();
-
- if (classBeingDefined && Arrays.asList(dataTypesList).contains(dataType) && verbChunk.getLemmas().get(2).toLowerCase().equals("attribute")){
- op.setOperatorFor(dataType + " "+verbChunk.getLemmas().get(verbChunk.getLemmas().size()-1).toLowerCase());
- classBeingDefined = true;
- break;
- }
- if (Arrays.asList(dataTypesList).contains(dataType) && verbChunk.getLemmas().get(2).toLowerCase().equals("attribute")){
- op.setOperatorFor(dataType + " "+verbChunk.getLemmas().get(verbChunk.getLemmas().size()-1).toLowerCase());
- classBeingDefined = true;
- break;
- }
- } else if (declarativeAction.equals("create")){
-
- // now substituting array
- if (verbChunk.getLemmas().get(1).toLowerCase().equals("array")){
-
- if(lems.contains("class")){
- int indClass = lems.indexOf("class");
- int numElements = lems.indexOf("elements");
- if (numElements<0)
- numElements = lems.indexOf("objects");
- if (numElements<0)
- numElements = lems.indexOf("members");
- String arraySize = lems.get(numElements-1);
- op.setOperatorFor(lems.get(indClass+1)+"[] "+verbChunk.getLemmas().get(verbChunk.getLemmas().size()-1).toLowerCase()
- +" = new "+lems.get(indClass+1)+"["+arraySize+"]");
- classBeingDefined = false;
- break;
- }
- }
- } else if (declarativeAction.equals("assign")){
- int numElements = lems.indexOf("element");
- if (numElements<0)
- numElements = lems.indexOf("object");
- if (numElements<0)
- numElements = lems.indexOf("member");
- if (Arrays.asList(arrayElementList).contains(lems.get(numElements-1))){
- int arrIndex = Arrays.asList(arrayElementList).indexOf(lems.get(numElements-1));
- String indexValue = arrayElementListInsdex[arrIndex];
-
- String arrayName = lems.get(lems.size()-1);
- if (expression!=null)
- op.setOperatorFor(arrayName+"["+indexValue+"]."+ expression);
- break;
- }
- } else if (declarativeAction.equals("set")){
- int indQuantifier = lems.indexOf("all");
- if (indQuantifier>-1 &&
- (lems.get(indQuantifier+1).equals("elements") || lems.get(indQuantifier+1).equals("members") )){
-
- String arrayName = lems.get(lems.size()-1);
- if (expression!=null)
- op.setOperatorFor("for(int i=0; i<"+ arrayName+".size(); i++) "+
- arrayName+"[i]."+ expression);
- break;
- }
- }
- /*
+ private boolean classBeingDefined = false;
+ public static String[] declarationStatesList = new String[] {
+ "create", "assign", "set",
+ };
+
+ public static String[] dataTypesList = new String[] {
+ "text", "double", "array",
+ };
+
+ public static String[] arrayElementList = new String[] {
+ "first", "second", "third", "fourth"
+ };
+
+ public static String[] arrayElementListInsdex = new String[] {
+ "0", "1", "2", "3"
+ };
+
+
+
+ public NL2ObjCreateAssign() {
+ super();
+ }
+
+ public NL2ObjCreateAssign(String path) {
+ super(path);
+ }
+
+ @Override
+ public ObjectPhraseListForSentence convertSentenceToControlObjectPhrase(String sentence){
+ String expression = null;
+ if (sentence.indexOf(":")>-1){
+ expression = sentence.split(":")[1];
+ sentence = sentence.split(":")[0]+".";
+ }
+
+
+ List<ObjectPhrase> oPhrases = new ArrayList<ObjectPhrase>();
+ parser = ParserChunker2MatcherProcessor.getInstance();
+ List<List<ParseTreeChunk>> lingPhrases =
+ parser.formGroupedPhrasesFromChunksForSentence(sentence);
+
+ ObjectControlOp op = extractControlPart(lingPhrases, prevOp);
+ prevOp = op;
+
+ //start with verb phrases
+ List<ParseTreeChunk> actionWithObject = lingPhrases.get(1);
+ actionWithObject.addAll( lingPhrases.get(4));
+
+ System.out.println(" === "+actionWithObject);
+
+ for(ParseTreeChunk verbChunk: actionWithObject){
+ List<String> lems = verbChunk.getLemmas();
+ String declarativeAction = verbChunk.getLemmas().get(0).toLowerCase();
+ if (declarativeAction.equals("define")){
+ if (verbChunk.getLemmas().get(1).toLowerCase().equals("class") ||
+ verbChunk.getLemmas().get(2).toLowerCase().equals("class")){
+ // new class
+ String className = verbChunk.getLemmas().get(verbChunk.getLemmas().size()-1).toLowerCase();
+ className = className.substring(0, 1).toUpperCase()+className.substring(1, className.length());
+ op.setOperatorIf("class "+className + "{");
+ op.setOperatorFor("{");
+ classBeingDefined = true;
+ break;
+ }
+ String dataType = verbChunk.getLemmas().get(1).toLowerCase();
+
+ if (classBeingDefined && Arrays.asList(dataTypesList).contains(dataType) && verbChunk.getLemmas().get(2).toLowerCase().equals("attribute")){
+ op.setOperatorFor(dataType + " "+verbChunk.getLemmas().get(verbChunk.getLemmas().size()-1).toLowerCase());
+ classBeingDefined = true;
+ break;
+ }
+ if (Arrays.asList(dataTypesList).contains(dataType) && verbChunk.getLemmas().get(2).toLowerCase().equals("attribute")){
+ op.setOperatorFor(dataType + " "+verbChunk.getLemmas().get(verbChunk.getLemmas().size()-1).toLowerCase());
+ classBeingDefined = true;
+ break;
+ }
+ } else if (declarativeAction.equals("create")){
+
+ // now substituting array
+ if (verbChunk.getLemmas().get(1).toLowerCase().equals("array")){
+
+ if(lems.contains("class")){
+ int indClass = lems.indexOf("class");
+ int numElements = lems.indexOf("elements");
+ if (numElements<0)
+ numElements = lems.indexOf("objects");
+ if (numElements<0)
+ numElements = lems.indexOf("members");
+ String arraySize = lems.get(numElements-1);
+ op.setOperatorFor(lems.get(indClass+1)+"[] "+verbChunk.getLemmas().get(verbChunk.getLemmas().size()-1).toLowerCase()
+ +" = new "+lems.get(indClass+1)+"["+arraySize+"]");
+ classBeingDefined = false;
+ break;
+ }
+ }
+ } else if (declarativeAction.equals("assign")){
+ int numElements = lems.indexOf("element");
+ if (numElements<0)
+ numElements = lems.indexOf("object");
+ if (numElements<0)
+ numElements = lems.indexOf("member");
+ if (Arrays.asList(arrayElementList).contains(lems.get(numElements-1))){
+ int arrIndex = Arrays.asList(arrayElementList).indexOf(lems.get(numElements-1));
+ String indexValue = arrayElementListInsdex[arrIndex];
+
+ String arrayName = lems.get(lems.size()-1);
+ if (expression!=null)
+ op.setOperatorFor(arrayName+"["+indexValue+"]."+ expression);
+ break;
+ }
+ } else if (declarativeAction.equals("set")){
+ int indQuantifier = lems.indexOf("all");
+ if (indQuantifier>-1 &&
+ (lems.get(indQuantifier+1).equals("elements") || lems.get(indQuantifier+1).equals("members") )){
+
+ String arrayName = lems.get(lems.size()-1);
+ if (expression!=null)
+ op.setOperatorFor("for(int i=0; i<"+ arrayName+".size(); i++) "+
+ arrayName+"[i]."+ expression);
+ break;
+ }
+ }
+ /*
else {
List<String> paramValues = verbChunk.getLemmas(), paramPOSs = verbChunk.getPOSs();
@@ -205,40 +214,40 @@ public class NL2ObjCreateAssign extends
oPhrases.add(oPhrase);
} */
- }
+ }
- ObjectPhraseListForSentence oplfs = new ObjectPhraseListForSentence( oPhrases, op);
- oplfs.cleanMethodNamesIsAre();
- oplfs.substituteNullObjectIntoEmptyArg();
-
- return oplfs;
- }
-
- public static void main(String[] args){
-
- String[] text = new String[]{
- "Define a class and name it Employee. ",
- "Define text attribute and name it m_name. ",
- "Define double attribute and name it m_salary.",
- "Create array of objects of class Employee for 10 elements, name the object as workforce.",
- "Assign the first element in array workforce: m_name=\"Boss\"",
- "Assign the second element in array workforce: m_name=\"His wife\"",
- // "Comment: We just started our small business company and expect to hire 8 more people soon.",
- "Set for all elements in array workforce: m_salary=0 ",
- "Print the list of all m_name attributes for workforce."
-
- };
-
- NL2Obj compiler = new NL2ObjCreateAssign();
- for(String sent:text){
- ObjectPhraseListForSentence opls=null;
- try {
- opls = compiler.convertSentenceToControlObjectPhrase(sent);
- } catch (Exception e) {
- e.printStackTrace();
- }
- System.out.println(sent+"\n"+opls+"\n");
- }
+ ObjectPhraseListForSentence oplfs = new ObjectPhraseListForSentence( oPhrases, op);
+ oplfs.cleanMethodNamesIsAre();
+ oplfs.substituteNullObjectIntoEmptyArg();
+
+ return oplfs;
+ }
+
+ public static void main(String[] args){
+
+ String[] text = new String[]{
+ "Define a class and name it Employee. ",
+ "Define text attribute and name it m_name. ",
+ "Define double attribute and name it m_salary.",
+ "Create array of objects of class Employee for 10 elements, name the object as workforce.",
+ "Assign the first element in array workforce: m_name=\"Boss\"",
+ "Assign the second element in array workforce: m_name=\"His wife\"",
+ // "Comment: We just started our small business company and expect to hire 8 more people soon.",
+ "Set for all elements in array workforce: m_salary=0 ",
+ "Print the list of all m_name attributes for workforce."
+
+ };
+
+ NL2Obj compiler = new NL2ObjCreateAssign();
+ for(String sent:text){
+ ObjectPhraseListForSentence opls=null;
+ try {
+ opls = compiler.convertSentenceToControlObjectPhrase(sent);
+ } catch (Exception e) {
+ e.printStackTrace();
+ }
+ System.out.println(sent+"\n"+opls+"\n");
+ }
- }
+ }
}
Added: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/ArcType.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/ArcType.java?rev=1555944&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/ArcType.java (added)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/ArcType.java Mon Jan 6 17:48:30 2014
@@ -0,0 +1,54 @@
+package opennlp.tools.parse_thicket;
+
+/**
+ * Describes the kind of a relation arc: a coarse type plus a finer subtype,
+ * each accompanied by a numeric id.
+ */
+public class ArcType {
+
+    /** Coarse arc category, e.g. {@code "rst"}. */
+    private String type;
+
+    /** Finer-grained category within the type, e.g. {@code "rst-explain"}. */
+    private String subtype;
+
+    /** Numeric id paired with {@link #type}. */
+    private Integer type_id;
+
+    /** Numeric id paired with {@link #subtype}. */
+    private Integer subtype_id;
+
+    /**
+     * Creates a fully specified arc type.
+     *
+     * @param type       coarse category, e.g. {@code "rst"}
+     * @param subtype    fine category, e.g. {@code "rst-explain"}
+     * @param type_id    numeric id of the type
+     * @param subtype_id numeric id of the subtype
+     */
+    public ArcType(String type, String subtype, Integer type_id, Integer subtype_id) {
+        this.type_id = type_id;
+        this.subtype_id = subtype_id;
+        this.type = type;
+        this.subtype = subtype;
+    }
+
+    // ---- type / subtype names ----
+
+    /** @return the coarse category */
+    public String getType() {
+        return this.type;
+    }
+
+    /** @param type the new coarse category */
+    public void setType(String type) {
+        this.type = type;
+    }
+
+    /** @return the fine category */
+    public String getSubtype() {
+        return this.subtype;
+    }
+
+    /** @param subtype the new fine category */
+    public void setSubtype(String subtype) {
+        this.subtype = subtype;
+    }
+
+    // ---- numeric ids ----
+
+    /** @return the numeric id of the type */
+    public Integer getType_id() {
+        return this.type_id;
+    }
+
+    /** @param type_id the new numeric id of the type */
+    public void setType_id(Integer type_id) {
+        this.type_id = type_id;
+    }
+
+    /** @return the numeric id of the subtype */
+    public Integer getSubtype_id() {
+        return this.subtype_id;
+    }
+
+    /** @param subtype_id the new numeric id of the subtype */
+    public void setSubtype_id(Integer subtype_id) {
+        this.subtype_id = subtype_id;
+    }
+
+    /**
+     * @return {@code type:subtype}; a null component is rendered as {@code "null"}
+     */
+    @Override
+    public String toString() {
+        StringBuilder text = new StringBuilder();
+        text.append(type).append(":").append(subtype);
+        return text.toString();
+    }
+}
\ No newline at end of file
Added: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/IGeneralizer.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/IGeneralizer.java?rev=1555944&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/IGeneralizer.java (added)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/IGeneralizer.java Mon Jan 6 17:48:30 2014
@@ -0,0 +1,12 @@
+package opennlp.tools.parse_thicket;
+
+import java.util.List;
+
+/**
+ * Contract for anything that can be generalized. All objects such as words,
+ * parse tree nodes, phrases, communicative actions etc. are subject to
+ * generalization and should therefore implement this interface.
+ *
+ * @param <T> the type of the objects produced by a generalization
+ */
+public interface IGeneralizer<T> {
+
+    /**
+     * Generalizes two objects. In this project everything is subject to
+     * generalization, and the outcome is a list of generic objects.
+     *
+     * @param o1 the first object to generalize
+     * @param o2 the second object to generalize
+     * @return the list of generalization results
+     */
+    List<T> generalize(Object o1, Object o2);
+}
Added: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/PTTree.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/PTTree.java?rev=1555944&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/PTTree.java (added)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/PTTree.java Mon Jan 6 17:48:30 2014
@@ -0,0 +1,89 @@
+package opennlp.tools.parse_thicket;
+
+import java.io.PrintWriter;
+import java.util.ArrayList;
+import java.util.List;
+
+import edu.stanford.nlp.trees.LabeledScoredTreeNode;
+import edu.stanford.nlp.trees.SimpleTree;
+import edu.stanford.nlp.trees.Tree;
+import edu.stanford.nlp.trees.TreeFactory;
+
+
+
+/**
+ * A {@link SimpleTree} subclass that walks a parse tree using the same
+ * traversal skeleton as a Penn Treebank pretty-printer. All printing calls are
+ * commented out, so in its current form the traversal produces no output and
+ * never adds anything to the phrase list it carries around.
+ */
+public class PTTree extends SimpleTree {
+
+ /** Creates an empty tree. */
+ public PTTree(){
+ super();
+ }
+
+ /**
+ * Creates an empty tree.
+ * NOTE(review): the argument {@code t} is ignored - nothing is copied from it.
+ *
+ * @param t currently unused
+ */
+ public PTTree(Tree t){
+ super();
+ }
+ private static final long serialVersionUID = 1L;
+
+ /**
+ * NOTE(review): this method calls itself unconditionally, so any invocation
+ * recurses until the stack overflows. Presumably it was meant to delegate to
+ * the superclass - confirm the intended behavior before relying on it.
+ */
+ @Override
+ public PTTree[] children() {
+ return children();
+ }
+
+ /**
+ * Not implemented.
+ *
+ * @return always {@code null}
+ */
+ @Override
+ public TreeFactory treeFactory() {
+ // TODO Auto-generated method stub
+ return null;
+ }
+
+ /**
+ * Starts a traversal from this node as the top-level node at indent 0.
+ * The phrase list created here is local and is discarded when the method returns.
+ */
+ public void doNavigate(){
+ List<LabeledScoredTreeNode> phrases = new ArrayList<LabeledScoredTreeNode>();
+ navigate(0, false, false, false, true, true, phrases);
+ }
+
+ /**
+ * Visits each child in order, tracking whether it is the first sibling and
+ * whether its left sibling was a pre-terminal.
+ *
+ * @param trChildren the children to visit
+ * @param indent indentation depth passed on to each child
+ * @param parentLabelNull whether the parent has no label (or no label value)
+ * @param onlyLabelValue passed on to each child's navigate call
+ * @param phrases list handed through the traversal; never modified here
+ */
+ private static void navigateChildren(PTTree[] trChildren, int indent, boolean parentLabelNull, boolean onlyLabelValue, List<LabeledScoredTreeNode> phrases) {
+ boolean firstSibling = true;
+ boolean leftSibIsPreTerm = true; // counts as true at beginning
+ for (PTTree currentTree : trChildren) {
+ currentTree.navigate(indent, parentLabelNull, firstSibling, leftSibIsPreTerm, false, onlyLabelValue, phrases);
+ leftSibIsPreTerm = currentTree.isPreTerminal();
+ // CC is a special case for English, but leave it in so we can exactly match PTB3 tree formatting
+ if (currentTree.value() != null && currentTree.value().startsWith("CC")) {
+ leftSibIsPreTerm = false;
+ }
+ firstSibling = false;
+ }
+ }
+
+ /**
+ * Visits this node and then, unless it is a leaf or pre-terminal, its children.
+ * The output statements are commented out, so the strings built below are
+ * computed and then dropped.
+ *
+ * @param indent current indentation depth
+ * @param parentLabelNull whether the parent has no label (or no label value)
+ * @param firstSibling whether this node is the first child of its parent
+ * @param leftSiblingPreTerminal whether the preceding sibling was a pre-terminal
+ * @param topLevel whether this node is the traversal root
+ * @param onlyLabelValue whether to use value() instead of nodeString() for the node text
+ * @param phrases list handed through the traversal; never modified here
+ */
+ private void navigate(int indent, boolean parentLabelNull, boolean firstSibling, boolean leftSiblingPreTerminal, boolean topLevel, boolean onlyLabelValue, List<LabeledScoredTreeNode> phrases) {
+ // the condition for staying on the same line in Penn Treebank
+ boolean suppressIndent = (parentLabelNull || (firstSibling && isPreTerminal()) || (leftSiblingPreTerminal && isPreTerminal() && (label() == null || !label().value().startsWith("CC"))));
+ if (suppressIndent) {
+ //pw.print(" ");
+ // pw.flush();
+ } else {
+ if (!topLevel) {
+ //pw.println();
+ }
+ for (int i = 0; i < indent; i++) {
+ //pw.print(" ");
+ // pw.flush();
+ }
+ }
+ // leaves and pre-terminals end the descent
+ if (isLeaf() || isPreTerminal()) {
+ String terminalString = toStringBuilder(new StringBuilder(), onlyLabelValue).toString();
+ //pw.print(terminalString);
+ //pw.flush();
+ return;
+ }
+ //pw.print("(");
+ String nodeString = onlyLabelValue ? value() : nodeString();
+ //pw.print(nodeString);
+ // pw.flush();
+ boolean parentIsNull = label() == null || label().value() == null;
+ // NOTE(review): children() as declared in this class recurses forever - see its note
+ navigateChildren(children(), indent + 1, parentIsNull, true, phrases);
+ //pw.print(")");
+
+ }
+
+}
Added: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/Pair.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/Pair.java?rev=1555944&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/Pair.java (added)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/Pair.java Mon Jan 6 17:48:30 2014
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.parse_thicket;
+
+import java.util.Comparator;
+
+/**
+ * Generic pair class for holding two objects. Often used as return object.
+ *
+ * @author Albert-Jan de Vries
+ *
+ * @param <T1>
+ * @param <T2>
+ */
+public class Pair<T1, T2> {
+    private T1 first;
+
+    private T2 second;
+
+    /** Creates a pair whose two components are both {@code null}. */
+    public Pair() {
+
+    }
+
+    /**
+     * Creates a pair from the two given components.
+     *
+     * @param first  the first component
+     * @param second the second component
+     */
+    public Pair(T1 first, T2 second) {
+        this.first = first;
+        this.second = second;
+    }
+
+    /** @return the first component, possibly {@code null} */
+    public T1 getFirst() {
+        return first;
+    }
+
+    /** @param first the new first component */
+    public void setFirst(T1 first) {
+        this.first = first;
+    }
+
+    /** @return the second component, possibly {@code null} */
+    public T2 getSecond() {
+        return second;
+    }
+
+    /** @param second the new second component */
+    public void setSecond(T2 second) {
+        this.second = second;
+    }
+
+    /**
+     * Orders pairs by their second component in descending order, provided
+     * both second components are {@link Float}s.
+     */
+    public class PairComparable implements Comparator<Pair<T1, T2>> {
+        /**
+         * Compares two pairs by their {@code Float} second components, larger
+         * values first.
+         *
+         * @param o1 the first pair
+         * @param o2 the second pair
+         * @return a negative value if {@code o1} has the larger second
+         *         component, zero if both are equal, a positive value if
+         *         {@code o1} has the smaller one; {@code -2} if either second
+         *         component is not a {@code Float}
+         */
+        // Raw parameter types are kept so existing callers that pass
+        // arbitrary Pair instances keep compiling.
+        @Override
+        @SuppressWarnings("rawtypes")
+        public int compare(Pair o1, Pair o2) {
+            if (o1.second instanceof Float && o2.second instanceof Float) {
+                // Compare the numeric values, not the boxed references: two
+                // distinct Float objects holding the same number must tie.
+                // Arguments are swapped to obtain descending order.
+                return Float.compare((Float) o2.second, (Float) o1.second);
+            }
+            // NOTE(review): -2 is returned regardless of argument order, which
+            // is not a consistent ordering for non-Float pairs; kept for
+            // backward compatibility.
+            return -2;
+        }
+    }
+
+    /**
+     * @return both components separated by a single space; a {@code null}
+     *         component is rendered as {@code "null"}
+     */
+    @Override
+    public String toString() {
+        // String concatenation tolerates null components, unlike calling
+        // toString() on them directly.
+        return first + " " + second;
+    }
+
+}
+
Added: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/ParseCorefsBuilder.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/ParseCorefsBuilder.java?rev=1555944&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/ParseCorefsBuilder.java (added)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/ParseCorefsBuilder.java Mon Jan 6 17:48:30 2014
@@ -0,0 +1,191 @@
+package opennlp.tools.parse_thicket;
+
+import java.io.*;
+import java.util.*;
+
+import opennlp.tools.parse_thicket.communicative_actions.CommunicativeActionsArcBuilder;
+
+import edu.stanford.nlp.dcoref.CorefChain;
+import edu.stanford.nlp.dcoref.CorefCoreAnnotations;
+import edu.stanford.nlp.dcoref.CorefChain.CorefMention;
+import edu.stanford.nlp.ling.*;
+import edu.stanford.nlp.ling.CoreAnnotations.NamedEntityTagAnnotation;
+import edu.stanford.nlp.ling.CoreAnnotations.PartOfSpeechAnnotation;
+import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
+import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
+import edu.stanford.nlp.pipeline.*;
+import edu.stanford.nlp.trees.*;
+import edu.stanford.nlp.util.*;
+
+/**
+ * Builds a {@link ParseThicket} for a paragraph of text by running a Stanford
+ * CoreNLP pipeline (tokenize, ssplit, pos, lemma, ner, parse, dcoref) and
+ * turning every pair of coreferent mentions into an inter-sentence arc.
+ */
+public class ParseCorefsBuilder {
+    protected static ParseCorefsBuilder instance;
+    private Annotation annotation;
+    StanfordCoreNLP pipeline;
+    CommunicativeActionsArcBuilder caFinder = new CommunicativeActionsArcBuilder();
+
+    /**
+     * singleton method of instantiating the processor
+     *
+     * @return the instance
+     */
+    public synchronized static ParseCorefsBuilder getInstance() {
+        if (instance == null)
+            instance = new ParseCorefsBuilder();
+
+        return instance;
+    }
+
+    /** Configures and creates the CoreNLP pipeline used by this builder. */
+    ParseCorefsBuilder(){
+        Properties props = new Properties();
+        props.put("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");
+        pipeline = new StanfordCoreNLP(props);
+    }
+
+    /**
+     * Annotates the text and collects, per sentence, its parse tree and its
+     * token nodes, plus one arc for every pair of mentions within each
+     * coreference chain.
+     *
+     * @param text the paragraph to process
+     * @return the thicket holding the parse trees, the coreference arcs and
+     *         the per-sentence node lists; if annotation fails the stack trace
+     *         is printed and whatever was collected so far is returned
+     */
+    public ParseThicket buildParseThicket(String text){
+        List<Tree> ptTrees = new ArrayList<Tree>();
+        // all numbering from 1, not 0
+        List<WordWordInterSentenceRelationArc> arcs = new ArrayList<WordWordInterSentenceRelationArc>();
+        List<List<ParseTreeNode>> nodesThicket = new ArrayList<List<ParseTreeNode>>();
+
+        annotation = new Annotation(text);
+        try {
+            pipeline.annotate(annotation);
+            List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);
+            if (sentences != null) {
+                for (CoreMap sentence : sentences) {
+                    List<ParseTreeNode> nodes = new ArrayList<ParseTreeNode>();
+
+                    // traversing the words in the current sentence
+                    // a CoreLabel is a CoreMap with additional token-specific methods
+                    Class<TokensAnnotation> tokenAnn = TokensAnnotation.class;
+                    List<CoreLabel> coreLabelList = sentence.get(tokenAnn);
+                    int count = 1;
+                    for (CoreLabel token : coreLabelList) {
+                        // this is the text of the token (stored in the node's word slot)
+                        String lemma = token.get(TextAnnotation.class);
+                        // this is the POS tag of the token
+                        String pos = token.get(PartOfSpeechAnnotation.class);
+                        // this is the NER label of the token
+                        String ne = token.get(NamedEntityTagAnnotation.class);
+                        nodes.add(new ParseTreeNode(lemma, pos, ne, count));
+                        count++;
+                    }
+                    nodesThicket.add(nodes);
+                    Tree tree = sentence.get(TreeCoreAnnotations.TreeAnnotation.class);
+                    ptTrees.add(tree);
+                }
+            }
+        } catch (Exception e) {
+            e.printStackTrace();
+        }
+
+        // now coreferences
+        Map<Integer, CorefChain> corefs = annotation.get(CorefCoreAnnotations.CorefChainAnnotation.class);
+        // The chain map is absent when annotation failed above (the exception is
+        // only logged), so guard against null instead of crashing here.
+        if (corefs != null) {
+            for (CorefChain c : new ArrayList<CorefChain>(corefs.values())) {
+                List<CorefMention> mentions = c.getMentionsInTextualOrder();
+                // one arc per unordered pair of mentions in the chain
+                for (int i = 0; i < mentions.size(); i++) {
+                    for (int j = i + 1; j < mentions.size(); j++) {
+                        CorefMention mi = mentions.get(i), mj = mentions.get(j);
+
+                        int niSentence = mi.position.get(0);
+                        int niWord = mi.startIndex;
+                        int njSentence = mj.position.get(0);
+                        int njWord = mj.startIndex;
+
+                        ArcType arcType = new ArcType("coref-", mj.mentionType+"-"+mj.animacy, 0, 0);
+
+                        WordWordInterSentenceRelationArc arc =
+                                new WordWordInterSentenceRelationArc(new Pair<Integer, Integer>(niSentence,niWord),
+                                        new Pair<Integer, Integer>(njSentence,njWord), mi.mentionSpan, mj.mentionSpan,
+                                        arcType);
+                        arcs.add(arc);
+                    }
+                }
+            }
+        }
+        // NOTE(review): the communicative-action arcs are computed but never
+        // added to the result; confirm whether they should be merged into arcs.
+        List<WordWordInterSentenceRelationArc> arcsCA = buildCAarcs(nodesThicket);
+
+        ParseThicket result = new ParseThicket(ptTrees, arcs);
+        result.setNodesThicket(nodesThicket);
+        return result;
+    }
+
+    /**
+     * Builds one arc for every pair of sentences in which a communicative
+     * action is found in both, labelled with the generalization of the two
+     * actions.
+     *
+     * @param nodesThicket token nodes, one list per sentence
+     * @return the communicative-action arcs; sentence positions in these arcs
+     *         are the 0-based list indices
+     */
+    private List<WordWordInterSentenceRelationArc> buildCAarcs(
+            List<List<ParseTreeNode>> nodesThicket) {
+        List<WordWordInterSentenceRelationArc> arcs = new ArrayList<WordWordInterSentenceRelationArc>();
+
+        for(int sentI=0; sentI<nodesThicket.size(); sentI++){
+            for(int sentJ=sentI+1; sentJ<nodesThicket.size(); sentJ++){
+                List<ParseTreeNode> sentenceI = nodesThicket.get(sentI),
+                        sentenceJ = nodesThicket.get(sentJ);
+                Pair<String, Integer[]> caI = caFinder.findCAInSentence(sentenceI);
+                Pair<String, Integer[]> caJ = caFinder.findCAInSentence(sentenceJ);
+                int indexCA1 = caFinder.findCAIndexInSentence(sentenceI);
+                int indexCA2 = caFinder.findCAIndexInSentence(sentenceJ);
+                // skip the pair unless both sentences contain a communicative action
+                if (caI==null || caJ==null)
+                    continue;
+                Pair<String, Integer[]> caGen = caFinder.generalize(caI, caJ).get(0);
+
+                ArcType arcType = new ArcType("ca",
+                        caGen.getFirst().toString()+printNumArray(caGen.getSecond()), 0, 0);
+                WordWordInterSentenceRelationArc arc =
+                        new WordWordInterSentenceRelationArc(new Pair<Integer, Integer>(sentI,indexCA1),
+                                new Pair<Integer, Integer>(sentJ,indexCA2), caI.getFirst(), caJ.getFirst(),
+                                arcType);
+                arcs.add(arc);
+
+            }
+        }
+
+        return arcs;
+    }
+
+    /**
+     * Renders the numbers separated (and followed) by single spaces.
+     *
+     * @param arr the numbers to render
+     * @return e.g. {@code "1 2 3 "}
+     */
+    private String printNumArray(Integer[] arr){
+        StringBuilder buf = new StringBuilder();
+        for(Integer i: arr){
+            buf.append(Integer.toString(i)+ " ");
+        }
+        return buf.toString();
+    }
+
+    /** Demo entry point: builds a thicket for a sample four-sentence paragraph. */
+    public static void main(String[] args) throws IOException {
+        ParseCorefsBuilder builder = ParseCorefsBuilder.getInstance();
+        ParseThicket th = builder.buildParseThicket("Iran refuses to accept the UN proposal to end its dispute over its work on nuclear weapons."+
+                "UN nuclear watchdog passes a resolution condemning Iran for developing its second uranium enrichment site in secret. " +
+                "A recent IAEA report presented diagrams that suggested Iran was secretly working on nuclear weapons. " +
+                "Iran envoy says its nuclear development is for peaceful purpose, and the material evidence against it has been fabricated by the US. ");
+        //GraphFromPTreeBuilder gbuilder = new GraphFromPTreeBuilder();
+        //gbuilder.buildGraphFromPT(th);
+
+    }
+
+}
+
+/*
+ * [<sent=1-word=1..Iran> ===> <sent=3-word=9..Iran>, <sent=1-word=1..Iran> ===> <sent=4-word=1..Iran>, <sent=1-word=1..Iran> ===> <sent=4-word=4..its>, <sent=1-word=1..Iran> ===> <sent=4-word=17..it>, <sent=3-word=9..Iran> ===> <sent=4-word=1..Iran>, <sent=3-word=9..Iran> ===> <sent=4-word=4..its>, <sent=3-word=9..Iran> ===> <sent=4-word=17..it>, <sent=4-word=1..Iran> ===> <sent=4-word=4..its>, <sent=4-word=1..Iran> ===> <sent=4-word=17..it>, <sent=4-word=4..its> ===> <sent=4-word=17..it>, <sent=1-word=6..UN> ===> <sent=2-word=1..UN>, <sent=1-word=5..the UN proposal> ===> <sent=1-word=10..its>, <sent=1-word=5..the UN proposal> ===> <sent=1-word=13..its>, <sent=1-word=10..its> ===> <sent=1-word=13..its>, <sent=1-word=16..nuclear weapons> ===> <sent=3-word=14..nuclear weapons>, <sent=2-word=1..UN nuclear watchdog> ===> <sent=2-word=11..its>]
+
+[[[1]Iran:NNP>LOCATION, [2]refuses:VBZ>O, [3]to:TO>O, [4]accept:VB>O, [5]the:DT>O, [6]UN:NNP>ORGANIZATION, [7]proposal:NN>O, [8]to:TO>O, [9]end:VB>O, [10]its:PRP$>O, [11]dispute:NN>O, [12]over:IN>O, [13]its:PRP$>O, [14]work:NN>O, [15]on:IN>O, [16]nuclear:JJ>O, [17]weapons:NNS>O, [18].:.>O],
+
+[[1]UN:NNP>ORGANIZATION, [2]nuclear:JJ>O, [3]watchdog:NN>O, [4]passes:VBZ>O, [5]a:DT>O, [6]resolution:NN>O, [7]condemning:VBG>O, [8]Iran:NNP>LOCATION, [9]for:IN>O, [10]developing:VBG>O, [11]its:PRP$>O, [12]second:JJ>ORDINAL, [13]uranium:NN>O, [14]enrichment:NN>O, [15]site:NN>O, [16]in:IN>O, [17]secret:NN>O, [18].:.>O],
+
+[[1]A:DT>O, [2]recent:JJ>O, [3]IAEA:NNP>ORGANIZATION, [4]report:NN>O, [5]presented:VBD>O, [6]diagrams:NNS>O, [7]that:WDT>O, [8]suggested:VBD>O, [9]Iran:NNP>LOCATION, [10]was:VBD>O, [11]secretly:RB>O, [12]working:VBG>O, [13]on:IN>O, [14]nuclear:JJ>O, [15]weapons:NNS>O, [16].:.>O],
+
+[[1]Iran:NNP>LOCATION, [2]envoy:NN>O, [3]says:VBZ>O, [4]its:PRP$>O, [5]nuclear:JJ>O, [6]development:NN>O, [7]is:VBZ>O, [8]for:IN>O, [9]peaceful:JJ>O, [10]purpose:NN>O, [11],:,>O, [12]and:CC>O, [13]the:DT>O, [14]material:NN>O, [15]evidence:NN>O, [16]against:IN>O, [17]it:PRP>O, [18]has:VBZ>O, [19]been:VBN>O, [20]fabricated:VBN>O, [21]by:IN>O, [22]the:DT>O, [23]US:NNP>LOCATION, [24].:.>O]]
+*/
Added: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/ParseThicket.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/ParseThicket.java?rev=1555944&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/ParseThicket.java (added)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/ParseThicket.java Mon Jan 6 17:48:30 2014
@@ -0,0 +1,59 @@
+package opennlp.tools.parse_thicket;
+
+import java.util.List;
+
+import edu.stanford.nlp.trees.Tree;
+
+/**
+ * A parse thicket: the parse trees of a paragraph's sentences together with
+ * the arcs relating words across sentences and the per-sentence token nodes.
+ */
+public class ParseThicket {
+    // parse trees
+    private List<Tree> sentenceTrees;
+    // there should be an arc for each sentence
+    private List<WordWordInterSentenceRelationArc> arcs;
+    // lists of nodes for each sentence
+    // then list for all sentences
+    private List<List<ParseTreeNode>> sentenceNodes;
+
+    /** @return the parse trees, one per sentence */
+    public List<Tree> getSentences() {
+        return sentenceTrees;
+    }
+
+    /** @param sentences the parse trees, one per sentence */
+    public void setSentences(List<Tree> sentences) {
+        this.sentenceTrees = sentences;
+    }
+
+    /** @return the inter-sentence relation arcs */
+    public List<WordWordInterSentenceRelationArc> getArcs() {
+        return arcs;
+    }
+
+    /** @param arcs the inter-sentence relation arcs */
+    public void setArcs(List<WordWordInterSentenceRelationArc> arcs) {
+        this.arcs = arcs;
+    }
+
+    /** @return the token nodes, one list per sentence; {@code null} if never set */
+    public List<List<ParseTreeNode>> getNodesThicket() {
+        return sentenceNodes;
+    }
+
+    /** @param nodesThicket the token nodes, one list per sentence */
+    public void setNodesThicket(List<List<ParseTreeNode>> nodesThicket) {
+        this.sentenceNodes = nodesThicket;
+    }
+
+    /**
+     * Builds the thicket for a paragraph via the shared
+     * {@link ParseCorefsBuilder} instance.
+     *
+     * @param paragraph the text to parse
+     */
+    public ParseThicket(String paragraph){
+        ParseCorefsBuilder builder = ParseCorefsBuilder.getInstance();
+        ParseThicket res = builder.buildParseThicket(paragraph);
+        this.sentenceTrees= res.sentenceTrees;
+        this.arcs = res.arcs;
+        // Carry over the node lists as well; otherwise getNodesThicket()
+        // would return null for thickets created through this constructor.
+        this.sentenceNodes = res.sentenceNodes;
+    }
+
+    /**
+     * Creates a thicket from already computed trees and arcs. The node lists
+     * are left unset; supply them with {@link #setNodesThicket(List)}.
+     *
+     * @param ptTrees the parse trees, one per sentence
+     * @param barcs   the inter-sentence relation arcs
+     */
+    public ParseThicket(List<Tree> ptTrees,
+            List<WordWordInterSentenceRelationArc> barcs) {
+        this.sentenceTrees= ptTrees;
+        this.arcs = barcs;
+    }
+
+    /** @return the trees and the arcs, separated by a newline */
+    @Override
+    public String toString(){
+        return this.sentenceTrees+"\n"+this.arcs;
+    }
+
+
+
+}
Added: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/ParseTreeNode.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/ParseTreeNode.java?rev=1555944&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/ParseTreeNode.java (added)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/ParseTreeNode.java Mon Jan 6 17:48:30 2014
@@ -0,0 +1,153 @@
+package opennlp.tools.parse_thicket;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * A token of a parsed sentence: the word, its POS tag, its NER label, an id and an
+ * optional phrase type. Two nodes can be generalized into a more abstract node.
+ */
+public class ParseTreeNode implements IGeneralizer<ParseTreeNode> {
+  String word;
+  // this is the POS tag of the token
+  String pos;
+  // this is the NER label of the token
+  String ne;
+  Integer id;
+  // phrase type, kept as plain text
+  String phraseType;
+
+  /** Phrase types, each created with a textual label. */
+  public enum PhraseType {
+    NP("NP"), VP("VP"), PRP("PRP");
+
+    private final String text;
+
+    private PhraseType(final String text) {
+      this.text = text;
+    }
+  }
+
+  /**
+   * Creates a fully specified node.
+   *
+   * @param word the token text
+   * @param pos POS tag of the token
+   * @param ne NER label of the token
+   * @param id id of the node
+   */
+  public ParseTreeNode(String word, String pos, String ne, Integer id) {
+    super();
+    this.word = word;
+    this.pos = pos;
+    this.ne = ne;
+    this.id = id;
+  }
+
+  /**
+   * Creates a node that only carries a word and its POS tag; the NER label and the
+   * id stay {@code null}.
+   *
+   * @param word the token text
+   * @param pos POS tag of the token
+   */
+  public ParseTreeNode(String word, String pos) {
+    // This constructor used to end with "this.ne = ne; this.id = id;", which only
+    // assigned the fields to themselves. Delegating makes the null defaults explicit.
+    this(word, pos, null, null);
+  }
+
+  public String getPhraseType() {
+    return phraseType;
+  }
+
+  public void setPhraseType(String pt) {
+    this.phraseType = pt;
+  }
+
+  public String getWord() {
+    return word;
+  }
+
+  public void setWord(String word) {
+    this.word = word;
+  }
+
+  public String getPos() {
+    return pos;
+  }
+
+  public void setPos(String pos) {
+    this.pos = pos;
+  }
+
+  public String getNe() {
+    return ne;
+  }
+
+  public void setNe(String ne) {
+    this.ne = ne;
+  }
+
+  public Integer getId() {
+    return id;
+  }
+
+  public void setId(Integer id) {
+    this.id = id;
+  }
+
+  /**
+   * Renders the non-null parts of the node in the form
+   * {@code <id>phraseType'word':pos}.
+   */
+  @Override
+  public String toString() {
+    // The buffer never leaves this method, so the unsynchronized StringBuilder
+    // replaces the former StringBuffer.
+    StringBuilder buf = new StringBuilder();
+    if (id != null) {
+      buf.append('<').append(id).append('>');
+    }
+    if (phraseType != null) {
+      buf.append(phraseType);
+    }
+    if (word != null) {
+      buf.append('\'').append(word).append('\'');
+    }
+    if (pos != null) {
+      buf.append(':').append(pos);
+    }
+    return buf.toString();
+  }
+
+  /**
+   * Generalizes two nodes into at most one node.
+   *
+   * @param o1 first node; must be a {@code ParseTreeNode}
+   * @param o2 second node; must be a {@code ParseTreeNode}
+   * @return a list with the generalized node (NER label "O", id -1), or an empty list
+   *         when the POS tags of the two nodes have no generalization
+   * @throws ClassCastException if an argument is not a {@code ParseTreeNode}
+   */
+  @Override
+  public List<ParseTreeNode> generalize(Object o1, Object o2) {
+    ParseTreeNode w1 = (ParseTreeNode) o1;
+    ParseTreeNode w2 = (ParseTreeNode) o2;
+    List<ParseTreeNode> result = new ArrayList<ParseTreeNode>();
+    String posGen = generalizePOS(w1.pos, w2.pos);
+    if (posGen != null) {
+      result.add(new ParseTreeNode(generalizeWord(w1.word, w2.word), posGen, "O", -1));
+    }
+    return result;
+  }
+
+  /**
+   * Generalizes two lemmas: identical lemmas are kept, anything else becomes "*".
+   *
+   * @param lemma1 first lemma, may be {@code null}
+   * @param lemma2 second lemma, may be {@code null}
+   * @return {@code lemma1} when both lemmas are equal, otherwise the wildcard "*"
+   */
+  public String generalizeWord(String lemma1, String lemma2) {
+    // Nodes may carry no word at all (see the null check in toString()), so guard
+    // against null before comparing instead of failing with a NullPointerException.
+    if (lemma1 != null && lemma1.equals(lemma2)) {
+      return lemma1;
+    }
+    // TODO (kept from the original): differing lemmas are not generalized any
+    // further yet, they all collapse to the wildcard.
+    return "*";
+  }
+
+  /**
+   * Generalizes two POS / phrase tags. The rules are tried in this order:
+   * <ol>
+   * <li>a noun tag (NN*) paired with NP, VBG or ADJP gives "NN";</li>
+   * <li>IN paired with TO gives "IN";</li>
+   * <li>two verb tags (VB*) give "VB";</li>
+   * <li>tags that are equal ignoring case give {@code pos1};</li>
+   * <li>tags that share their first two characters (ignoring case) give the first
+   * two characters of {@code pos1} followed by "*".</li>
+   * </ol>
+   *
+   * @param pos1 first tag, may be {@code null}
+   * @param pos2 second tag, may be {@code null}
+   * @return the generalized tag, or {@code null} when no rule applies or a tag is missing
+   */
+  public String generalizePOS(String pos1, String pos2) {
+    if (pos1 == null || pos2 == null) {
+      return null;
+    }
+    // The VBG rule used to test (NN*, VBG) twice and never (VBG, NN*), so the result
+    // depended on the argument order. All three noun rules are symmetric now.
+    if (isNounWith(pos1, pos2, "NP") || isNounWith(pos1, pos2, "VBG")
+        || isNounWith(pos1, pos2, "ADJP")) {
+      return "NN";
+    }
+    if ((pos1.equals("IN") && pos2.equals("TO"))
+        || (pos1.equals("TO") && pos2.equals("IN"))) {
+      return "IN";
+    }
+    // VBx vs VBy = VB (does not matter which form for verb)
+    if (pos1.startsWith("VB") && pos2.startsWith("VB")) {
+      return "VB";
+    }
+    if (pos1.equalsIgnoreCase(pos2)) {
+      return pos1;
+    }
+    // ABx vs ABy gives AB*: only the first two characters are compared
+    String head1 = pos1.length() > 2 ? pos1.substring(0, 2) : pos1;
+    String head2 = pos2.length() > 2 ? pos2.substring(0, 2) : pos2;
+    return head1.equalsIgnoreCase(head2) ? head1 + "*" : null;
+  }
+
+  /** Tells whether one of the two tags is a noun tag (NN*) and the other equals {@code tag}. */
+  private static boolean isNounWith(String a, String b, String tag) {
+    return (a.startsWith("NN") && b.equals(tag)) || (b.startsWith("NN") && a.equals(tag));
+  }
+}
+
Added: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/Triple.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/Triple.java?rev=1555944&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/Triple.java (added)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/Triple.java Mon Jan 6 17:48:30 2014
@@ -0,0 +1,49 @@
+package opennlp.tools.parse_thicket;
+
+import java.util.Comparator;
+
+
+/**
+ * A mutable container for three values of independent types.
+ *
+ * @param <T1> type of the first value
+ * @param <T2> type of the second value
+ * @param <T3> type of the third value
+ */
+public class Triple<T1, T2, T3> {
+  private T1 first;
+  private T2 second;
+  private T3 third;
+
+  /** Creates a triple whose three values are all {@code null}. */
+  public Triple() {
+  }
+
+  /**
+   * Creates a triple holding the given values.
+   *
+   * @param first the first value
+   * @param second the second value
+   * @param third the third value
+   */
+  public Triple(T1 first, T2 second, T3 third) {
+    this.first = first;
+    this.second = second;
+    this.third = third;
+  }
+
+  /** @return the first value */
+  public T1 getFirst() {
+    return first;
+  }
+
+  /** @param first the new first value */
+  public void setFirst(T1 first) {
+    this.first = first;
+  }
+
+  /** @return the second value */
+  public T2 getSecond() {
+    return second;
+  }
+
+  /** @param second the new second value */
+  public void setSecond(T2 second) {
+    this.second = second;
+  }
+
+  /** @return the third value */
+  public T3 getThird() {
+    return third;
+  }
+
+  /** @param third the new third value */
+  public void setThird(T3 third) {
+    this.third = third;
+  }
+}
\ No newline at end of file
Added: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/WordWordInterSentenceRelationArc.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/WordWordInterSentenceRelationArc.java?rev=1555944&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/WordWordInterSentenceRelationArc.java (added)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/WordWordInterSentenceRelationArc.java Mon Jan 6 17:48:30 2014
@@ -0,0 +1,68 @@
+package opennlp.tools.parse_thicket;
+
+/**
+ * An arc leading from one word to another. Each end is described by a pair of
+ * integers (labelled as sentence and word by {@link #toString()}) and by a lemma;
+ * the arc additionally carries an {@link ArcType}.
+ */
+public class WordWordInterSentenceRelationArc {
+
+  // codes of the two ends of the arc
+  Pair<Integer, Integer> codeFrom;
+  Pair<Integer, Integer> codeTo;
+  // lemmas at the two ends of the arc
+  String lemmaFrom;
+  String lemmaTo;
+  ArcType arcType;
+
+  /**
+   * Creates an arc with all of its parts.
+   *
+   * @param codeFrom code of the start of the arc
+   * @param codeTo code of the end of the arc
+   * @param lemmaFrom lemma at the start of the arc
+   * @param lemmaTo lemma at the end of the arc
+   * @param arcType type of the arc
+   */
+  public WordWordInterSentenceRelationArc(
+      Pair<Integer, Integer> codeFrom, Pair<Integer, Integer> codeTo,
+      String lemmaFrom, String lemmaTo, ArcType arcType) {
+    this.codeFrom = codeFrom;
+    this.codeTo = codeTo;
+    this.lemmaFrom = lemmaFrom;
+    this.lemmaTo = lemmaTo;
+    this.arcType = arcType;
+  }
+
+  /** @return the code of the start of the arc */
+  public Pair<Integer, Integer> getCodeFrom() {
+    return codeFrom;
+  }
+
+  /** @param codeFrom the new code of the start of the arc */
+  public void setCodeFrom(Pair<Integer, Integer> codeFrom) {
+    this.codeFrom = codeFrom;
+  }
+
+  /** @return the code of the end of the arc */
+  public Pair<Integer, Integer> getCodeTo() {
+    return codeTo;
+  }
+
+  /** @param codeTo the new code of the end of the arc */
+  public void setCodeTo(Pair<Integer, Integer> codeTo) {
+    this.codeTo = codeTo;
+  }
+
+  /** @return the lemma at the start of the arc */
+  public String getLemmaFrom() {
+    return lemmaFrom;
+  }
+
+  /** @param lemmaFrom the new lemma at the start of the arc */
+  public void setLemmaFrom(String lemmaFrom) {
+    this.lemmaFrom = lemmaFrom;
+  }
+
+  /** @return the lemma at the end of the arc */
+  public String getLemmaTo() {
+    return lemmaTo;
+  }
+
+  /** @param lemmaTo the new lemma at the end of the arc */
+  public void setLemmaTo(String lemmaTo) {
+    this.lemmaTo = lemmaTo;
+  }
+
+  /** @return the type of the arc */
+  public ArcType getArcType() {
+    return arcType;
+  }
+
+  /** @param arcType the new type of the arc */
+  public void setArcType(ArcType arcType) {
+    this.arcType = arcType;
+  }
+
+  /**
+   * Renders both ends of the arc as
+   * {@code <sent=S-word=W..lemma> ===> <sent=S-word=W..lemma>}; the arc type is
+   * not part of the output.
+   */
+  @Override
+  public String toString() {
+    return describeEnd(codeFrom, lemmaFrom) + " ===> " + describeEnd(codeTo, lemmaTo);
+  }
+
+  /** Formats one end of the arc from its code pair and its lemma. */
+  private static String describeEnd(Pair<Integer, Integer> code, String lemma) {
+    return "<sent=" + code.getFirst() + "-word=" + code.getSecond() + ".." + lemma + ">";
+  }
+}
Added: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/apps/BingQueryRunnerMultipageSearchResults.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/apps/BingQueryRunnerMultipageSearchResults.java?rev=1555944&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/apps/BingQueryRunnerMultipageSearchResults.java (added)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/apps/BingQueryRunnerMultipageSearchResults.java Mon Jan 6 17:48:30 2014
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.parse_thicket.apps;
+
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.net.URL;
+import java.net.URLConnection;
+import java.net.URLEncoder;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.logging.Logger;
+
+import net.billylieurance.azuresearch.AzureSearchResultSet;
+import net.billylieurance.azuresearch.AzureSearchWebQuery;
+import net.billylieurance.azuresearch.AzureSearchWebResult;
+
+import opennlp.tools.similarity.apps.BingQueryRunner;
+import opennlp.tools.similarity.apps.HitBase;
+
+import org.apache.commons.lang.StringUtils;
+import org.json.JSONArray;
+import org.json.JSONObject;
+
+
+/**
+ * Runs a Bing web search through the Azure search client and converts the results
+ * into {@link HitBase} objects.
+ */
+public class BingQueryRunnerMultipageSearchResults extends BingQueryRunner {
+
+  // NOTE(review): this credential is hard-coded in source control. It should be
+  // rotated and loaded from configuration; it is kept here only so that existing
+  // behavior does not change.
+  private static final String BING_KEY = "e8ADxIjn9YyHx36EihdjH/tMqJJItUrrbPTUpKahiU0=";
+  // The logger used to be named after a class in opennlp.tools.similarity.apps;
+  // it is now named after this class.
+  private static final Logger LOG = Logger
+      .getLogger(BingQueryRunnerMultipageSearchResults.class.getName());
+
+  /**
+   * Runs one web search and returns its results.
+   *
+   * @param query search query
+   * @param nRes number of results to request per page
+   * @param bHighRank when {@code false}, page 5 of the results is requested instead
+   *        of the client's default page
+   * @return one hit per returned result, in the order delivered by the client; empty
+   *         when the client delivers no result set
+   */
+  public List<HitBase> runSearch(String query, int nRes, boolean bHighRank) {
+    // A fresh query object per search: the formerly shared instance kept page 5 from
+    // an earlier low-rank search and applied it to later high-rank searches as well.
+    AzureSearchWebQuery webQuery = new AzureSearchWebQuery();
+    webQuery.setAppid(BING_KEY);
+    webQuery.setQuery(query);
+    // Paging must be configured BEFORE doQuery(). It used to be set afterwards, so
+    // nRes and the page only took effect for the next search.
+    // NOTE(review): perPage is set before the page in case the client derives the
+    // result offset from it - confirm against the azuresearch client.
+    webQuery.setPerPage(nRes);
+    if (!bHighRank) {
+      webQuery.setPage(5);
+    }
+    webQuery.doQuery();
+
+    List<HitBase> results = new ArrayList<HitBase>();
+    AzureSearchResultSet<AzureSearchWebResult> ars = webQuery.getQueryResult();
+    if (ars == null) {
+      // iterating a missing result set would fail with a NullPointerException
+      LOG.warning("No result set returned for query: " + query);
+      return results;
+    }
+    for (AzureSearchWebResult anr : ars) {
+      HitBase h = new HitBase();
+      h.setAbstractText(anr.getDescription());
+      h.setTitle(anr.getTitle());
+      h.setUrl(anr.getUrl());
+      results.add(h);
+    }
+    return results;
+  }
+}