You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by bg...@apache.org on 2014/01/06 18:48:32 UTC

svn commit: r1555944 [3/11] - in /opennlp/sandbox/opennlp-similarity/src: main/java/opennlp/tools/apps/ main/java/opennlp/tools/apps/contentgen/ main/java/opennlp/tools/apps/contentgen/multithreaded/ main/java/opennlp/tools/apps/relevanceVocabs/ main/j...

Added: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/jsmlearning/FeatureSpaceCoverageProcessor.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/jsmlearning/FeatureSpaceCoverageProcessor.java?rev=1555944&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/jsmlearning/FeatureSpaceCoverageProcessor.java (added)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/jsmlearning/FeatureSpaceCoverageProcessor.java Mon Jan  6 17:48:30 2014
@@ -0,0 +1,317 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.jsmlearning;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.commons.lang.StringUtils;
+
+public class FeatureSpaceCoverageProcessor {
+
+	public Map<String, Integer> paramMap = new HashMap<String, Integer>(); // header name -> column index, filled by initParamMap
+	public String[] header; // header row handed to initParamMap
+	String[] attributes; // attribute names the intersection/coverage methods iterate over
+	// The constructor does no work: all state above is set up later by initParamMap.
+	public FeatureSpaceCoverageProcessor (){
+		
+	}
+
+	public void initParamMap(String[] attributes, String[] header){
+		this.header = header;
+		this.attributes = attributes;
+		for(int m=0; m<header.length; m++){
+			paramMap.put(header[m], m);
+		}
+	}
+
+
+	// distance between array and array
+	public Float calcDistance(String[] seed, String[] candidate) throws Exception {
+		if (paramMap.isEmpty())
+			throw new Exception("paramMap.isEmpty()");
+
+		Float score = 0f;
+		int p1 = paramMap.get("First Level Category");	
+		int p2 = paramMap.get("Second Level Category");
+		if (seed[p1].equals(candidate[p1])) {
+			if (seed[p2].equals(candidate[p2]))
+				score = score+0.0000001f;
+			else
+				score = score+0.01f;			
+		} else return 100000f;
+
+		try {
+			int p3 = paramMap.get("Latitude");	
+			int p4 = paramMap.get("Longitude");
+			Double latDiff = Math.abs(Double.parseDouble(seed[p3]) - Double.parseDouble(candidate[p3]));
+			Double longDiff = Math.abs(Double.parseDouble(seed[p4]) - Double.parseDouble(candidate[p4]));
+			if (latDiff>1 || longDiff>1)
+				return 1000000f;
+			else 
+				score+= latDiff.floatValue()/100.0f + longDiff.floatValue()/100.0f;
+		} catch (Exception e) {
+			return 1000000f;
+		}
+
+
+		return score;
+	}
+
+	// distance between matrix and array: seed is indexed as seed[column][v], candidate as candidate[column]
+	public Float calcDistance(String[][] seed, String[] candidate) throws Exception {
+		if (paramMap.isEmpty())
+			throw new Exception("paramMap.isEmpty()");
+		// category part: smallest category score over all seed entries v
+		Float score = 0f, catScore = 10000f, currCatScore=10000000f;
+		// currCatScore keeps its previous value when entry v has a different first-level category
+		int p1 = paramMap.get("First Level Category");	
+		int p2 = paramMap.get("Second Level Category");
+		for(int v=0; v<seed[0].length; v++){
+			if (seed[p1][v].equals(candidate[p1])) {
+				if (seed[p2][v].equals(candidate[p2]))
+					currCatScore = 0.0000001f;
+				else
+					currCatScore = 0.01f;			
+			} 
+			if ( catScore >  currCatScore) // if found closer, update
+				catScore =  currCatScore;
+		}
+		score = catScore;
+		if (score > 1000000f) // NOTE(review): catScore starts at 10000f and only decreases, so this looks unreachable
+			return 10000000f;
+		// coordinate part: smallest scaled lat/long difference among entries within 1 of the candidate
+		Float latLongScore = 100000f, currLatLongScore = 10000000f;
+		for(int v=0; v<seed[0].length; v++){
+			try {
+				int p3 = paramMap.get("Latitude");	
+				int p4 = paramMap.get("Longitude");
+				if (seed[p3][v].equals("") || seed[p4][v].equals("") 
+						|| candidate[p3].equals("") ||  candidate[p4].equals(""))
+					continue;
+				Double latDiff = Math.abs(Double.parseDouble(seed[p3][v]) - Double.parseDouble(candidate[p3]));
+				Double longDiff = Math.abs(Double.parseDouble(seed[p4][v]) - Double.parseDouble(candidate[p4]));
+				if (!(latDiff>1 || longDiff>1))
+					currLatLongScore = latDiff.floatValue()/100.0f + longDiff.floatValue()/100.0f;
+			} catch (Exception e) {
+				// exceptions are swallowed, so an unreadable entry is skipped (a "return 1000000f" here was disabled)
+			}
+			if (latLongScore > currLatLongScore)
+				latLongScore = currLatLongScore;
+			// (entries with an empty coordinate skip the update above through the continue)
+		}	
+		if (latLongScore> 10000) // still at its initial value: no usable coordinate pair was found
+			return 10000f;
+		score+=latLongScore;
+		return score;
+	}
+
+	// Looks up the column index registered in paramMap for the given attribute name.
+	// Returns null (after printing a "wrong key" diagnostic) when the name is unknown,
+	// so callers that unbox the result into an int will fail with a NullPointerException.
+	public Integer getIdForAttributeName(String key){
+		Integer res = paramMap.get(key);
+		if (res == null) {
+			// explicit check instead of provoking and catching a NullPointerException
+			System.out.println("wrong key"+key);
+		}
+		return res;
+
+	}
+
+	public String getAttribNameForId(Integer id){
+		return header[id];
+	}
+
+
+
+
+	// Builds the intersection of two data rows over the configured attributes.
+	// A value of the form {a,b,...} is treated as a set of members, anything else as a
+	// plain value. An attribute is kept only when both rows hold the same plain value,
+	// or when both hold sets that share at least one member (the shared members are kept).
+	// Returns a map from attribute name to the common value; it may be empty.
+	public Map<String, String> computeIntersection(String[] line1,
+			String[] line2) {
+
+		Map<String, String> common = new HashMap<String, String>();
+		for(String attr: attributes){
+			int column = getIdForAttributeName(attr);
+			// normalize: lower case, double quotes removed, up to two blanks after a comma removed
+			String first = line1[column].toLowerCase().replace("\"", "").replace(",  ", ", ").replace(", ", ",");
+			String second = line2[column].toLowerCase().replace("\"", "").replace(",  ", ", ").replace(", ", ",");
+			String firstSet = StringUtils.substringBetween(first, "{", "}");
+			String secondSet = StringUtils.substringBetween(second, "{", "}");
+
+			if (firstSet!=null && secondSet!=null) {
+				// both values are sets: keep the members of the first that also occur in the second
+				List<String> shared = new ArrayList<String>(
+						Arrays.asList(firstSet.replaceAll(", ", ",").split(",")));
+				shared.retainAll(new ArrayList<String>(
+						Arrays.asList(secondSet.replaceAll(", ", ",").split(","))));
+				if (shared.isEmpty())
+					continue;
+
+				// List.toString() gives "[a, b]"; the brackets are blanked out and trimmed away
+				String rendered = "{"+shared.toString().replace("["," ").replace("]", " ").trim()+"}";
+				common.put(attr, rendered);
+			}
+			else if (first.equals(second)){
+				// we assume single value, not an array of values: exact equality required
+				common.put(attr, first);
+			}
+		}
+
+		return common;
+	}
+
+
+		public boolean ruleCoversCase(Map<String, String> attr_value, String[] line){
+			boolean soFarCovers = true;		
+			for(String attr: attributes){
+				int attrIndex = getIdForAttributeName(attr);
+				String rule = attr_value.get(attr);
+				if (rule == null)
+					continue; // no constraint
+				rule = rule.toLowerCase().replace("\"", "").replace(",  ", ",").replace(", ", ",");
+				String vCase = line[attrIndex].toLowerCase().replace("\"", "").replace(",  ", ",").replace(", ", ",");
+				if (vCase==null){// rule for this attribute exists but case has no value
+					soFarCovers = false;
+					return false;
+				}
+				
+				String valArrCaseStr = StringUtils.substringBetween(vCase, "{", "}");
+				String valArrRuleStr = StringUtils.substringBetween(rule, "{", "}");
+				if (valArrCaseStr==null || valArrRuleStr==null) { // we assume single value, not an array of values
+					if (!vCase.equals(rule)){
+						soFarCovers = false;
+						return false;
+					}
+				}
+				else {
+					String[] valArrCase = valArrCaseStr.split(",");
+					String[] valArrRule = valArrRuleStr.split(","); 
+					List<String> valListCase = new ArrayList<String>(Arrays.asList(valArrCase));
+					List<String> valListRule = new ArrayList<String>(Arrays.asList(valArrRule));
+					
+					int ruleSize = valListRule.size();
+					//System.out.println(valListRule);
+					//System.out.println(valListCase);
+					
+					// rule members are subset of case
+					valListRule.retainAll(valListCase);
+					
+					//System.out.println(valListRule);
+					
+					if (ruleSize != valListRule.size()){
+						soFarCovers = false;
+						return false;
+					}
+					
+					
+					
+				}		    		
+			}
+			return  soFarCovers;
+		}
+		
+		public boolean ruleCoversRule(Map<String, String> attr_value, Map<String, String> line){ // true when every constraint of rule attr_value is matched by rule line
+			boolean soFarCovers = true;		// only ever set to false right before an immediate "return false"
+			for(String attr: attributes){
+				int attrIndex = getIdForAttributeName(attr); // value unused here; NOTE(review): the unboxing still fails for names missing from the header
+				String rule = attr_value.get(attr);
+				if (rule == null)
+					continue; // no constraint
+				// unlike ruleCoversCase, the values are compared as stored, without normalization
+				String vRuleBeingCovered = line.get(attr);
+				if (vRuleBeingCovered==null){// rule for this attribute exists but RuleBeingCovered has no value
+					soFarCovers = false;
+					return false;
+				}
+				// values of the form {a,b,...} are compared member by member, anything else as a whole string
+				String valArrRuleBeingCoveredStr = StringUtils.substringBetween(vRuleBeingCovered, "{", "}");
+				String valArrRuleStr = StringUtils.substringBetween(rule, "{", "}");
+				if (valArrRuleBeingCoveredStr==null || valArrRuleStr==null) { // we assume single value, not an array of values
+					if (!vRuleBeingCovered.equals(rule)){
+						soFarCovers = false;
+						return false;
+					}
+				}
+				else {
+					String[] valArrRuleBeingCovered = valArrRuleBeingCoveredStr.split(",");
+					String[] valArrRule = valArrRuleStr.split(","); 
+					List<String> valListRuleBeingCovered = new ArrayList<String>(Arrays.asList(valArrRuleBeingCovered));
+					List<String> valListRule = new ArrayList<String>(Arrays.asList(valArrRule));		
+					for(String r: valListRule){ // every member of attr_value's set needs a prefix-related member on the other side
+						if (!strListContainsMember(valListRuleBeingCovered, r)){
+							soFarCovers = false;
+							return false;
+						} 
+					}
+
+				}		    		
+			}
+			return  soFarCovers;
+		}
+
+		// Intersection of two rules: keeps an attribute only when both rules constrain it and
+		// either the plain values are equal or the {a,b,...} value sets share at least one member.
+		public Map<String, String> computeIntersection(
+				Map<String, String> rule1, Map<String, String> rule2) {
+			Map<String, String> common = new HashMap<String, String>();
+			for(String attr: attributes){
+				int attrIndex = getIdForAttributeName(attr); // result unused, but the lookup (and its failure on unknown names) is kept
+				String first = rule1.get(attr);
+				String second = rule2.get(attr);
+				if (first==null || second==null)
+					continue;
+
+				String firstSet = StringUtils.substringBetween(first, "{", "}");
+				String secondSet = StringUtils.substringBetween(second, "{", "}");
+				if (firstSet!=null && secondSet!=null) {
+					// set values: members of the first set that also occur in the second
+					List<String> shared = new ArrayList<String>(
+							Arrays.asList(firstSet.replaceAll(", ", ",").split(",")));
+					shared.retainAll(new ArrayList<String>(
+							Arrays.asList(secondSet.replaceAll(", ", ",").split(","))));
+					if (shared.isEmpty())
+						continue;
+					// List.toString() gives "[a, b]"; the brackets are blanked out and trimmed away
+					common.put(attr, "{"+shared.toString().replace("["," ").replace("]", " ").trim()+"}");
+				}
+				else if (first.equals(second)){
+					// we assume single value, not an array of values: exact equality required
+					common.put(attr, first);
+				}
+			}
+
+			return common;
+		}
+
+		// True when some list entry and r are prefix-related: either one starts with the other.
+		private boolean strListContainsMember(List<String> valListCase, String r) {
+			for(String candidate: valListCase){
+				boolean prefixRelated = candidate.startsWith(r) || r.startsWith(candidate);
+				if (prefixRelated)
+					return true;
+			}
+			return false;
+		}
+}
\ No newline at end of file

Added: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/jsmlearning/IntersectionSetBuilder.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/jsmlearning/IntersectionSetBuilder.java?rev=1555944&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/jsmlearning/IntersectionSetBuilder.java (added)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/jsmlearning/IntersectionSetBuilder.java Mon Jan  6 17:48:30 2014
@@ -0,0 +1,361 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.jsmlearning;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+/*
+ * 
+ * The rule is in the form
+The report also shows how many positive cases are covered by this rule (should be 0) and how many negative cases 
+are covered by this rule (should be above 1)
+
+The rule
+{plugin_number=3, service_type=all, mime_type_number=11, review_status=pass}	0	192
+
+should be read as 
+
+plugin_number=3 & service_type=all & mime_type_number=11 & review_status=pass
+
+For a single-valued attribute, the value in the case should be the one from this rule. For a multi-valued attribute, the set of values in the case
+should INCLUDE the set of values from the rule.
+
+The rule checking that a case belongs to the negative set is a disjunction of all rules in the result file.
+
+input: two data files, one is negative set and another is positive set.
+in the argument, just the negative file needs to be specified:
+".../negativeSet1.csv", 
+then the system assumes that the filename for the positive set is obtained by replacing 'negative' with 'positive'
+".../positiveSet1.csv", 
+
+The set of attributes used in the analysis is hard coded
+
+
+ */
+public class IntersectionSetBuilder{
+	private FeatureSpaceCoverageProcessor distProcessorPos, distProcessorNeg; // one processor per training set, created in ruleFormer/ruleVerifier
+	private float percentageOfAllowedSetCover = 0.001f; // threshold the positive-set cover ratio of a candidate rule is compared against
+	//The set of attributes used in the analysis is hard coded
+	String[] fieldsToAggr = new String[]{
+			"reason_code",	"risk_rating", "service_type", 	"device_match_result", 	"device_result", 	"http_referer", 	"device_id_reason_code",
+			"review_status", "tcp_os_sig_ttl", "tcp_connection_type",
+			"mime_type_number", "plugin_number", "http_connection_type", "device_last_event", "http_connection_type"
+			// NOTE(review): "http_connection_type" is listed twice above; the repeat looks
+			// unintentional (the attribute is simply processed twice) - confirm before removing.
+	};
+	public IntersectionSetBuilder() {};
+	
+	/*
+	 * Takes a file generated by public String ruleFormer(String dataFile)
+	 * and performs verification of coverage for positive and negative set, as well as dedupe of rules
+	 * The input for the negative/positive data set is the same as for ruleFormer.
+	 * The second argument is the rule file generated by ruleFormer.
+	 * Outputs the verified rule file.
+	 */
+
+	public void ruleVerifier(String dataFile, String ruleFile){
+		// dataFile is the negative set; the positive set is read from the same name with "negative" replaced by "positive"
+		// NOTE(review): readProfiles yields null for an unreadable file, which would fail at get(0) below
+		List<String[]> negativeSet = ProfileReaderWriter.readProfiles(dataFile); 
+		List<String[]> positiveSet = ProfileReaderWriter.readProfiles(dataFile.replace("negative", "positive")); 
+		distProcessorPos = new FeatureSpaceCoverageProcessor(); distProcessorNeg = new FeatureSpaceCoverageProcessor();
+		distProcessorNeg.initParamMap( 	fieldsToAggr, negativeSet.get(0));		
+		distProcessorPos.initParamMap(fieldsToAggr, positiveSet.get(0));		
+		negativeSet.remove(0); positiveSet.remove(0); // the first row of each set was used as its header
+		// parse the rule file: the first column of every row holds one rule written as {attr=value, attr=value}
+		List<String[]> ruleStrings = ProfileReaderWriter.readProfiles(ruleFile);
+		List<Map<String, String>> rules = new ArrayList<Map<String, String>>(), dedupedRules = new ArrayList<Map<String, String>>() ;
+		for(String[] l : ruleStrings){
+			Map<String, String> rule = new HashMap<String, String>();
+			String lstr = l[0].substring(1, l[0].length()-1); // strip the first and last character (the enclosing braces)
+			String[] ruleStr= lstr.split(","); // NOTE(review): values that themselves contain commas (e.g. {a, b} sets) are split here too
+			for(String attr_valueStr: ruleStr){
+				String[] attr_value =  attr_valueStr.split("=");	
+				if (attr_value.length==2)
+					rule.put(attr_value[0].trim(), attr_value[1].trim());
+				else if (attr_value.length==1)
+					rule.put(attr_value[0].trim(),"");
+				else
+					System.err.println("Problem parsing rule file "+lstr);
+			}
+			rules.add(rule);
+		}
+		// dedupe: rule i is dropped when some later rule j covers it (ruleCoversRule(j, i))
+		
+		for(int i=0; i<rules.size(); i++){
+			boolean bCovered = false;
+		
+			for(int j=i+1; j<rules.size(); j++){
+				if (distProcessorNeg.ruleCoversRule(rules.get(j), rules.get(i))){
+					bCovered = true;
+				}
+			}
+			if (!bCovered)
+				dedupedRules.add(rules.get(i));
+		}
+		
+		rules = dedupedRules;
+		// count, for every remaining rule, the cases it covers in the positive and in the negative set
+		List<String[]> output = new ArrayList<String[]>();
+		output.add(new String[]{"rule", "# covers positive", "# covers negative"});
+		for(Map<String, String> rule: rules){
+			int countCoverNeg = 0, countCoverPos=0;
+			for(String[] line: positiveSet){
+				if (distProcessorPos.ruleCoversCase(rule, line)){
+					countCoverPos++;
+				}
+			}
+			for(String[] line: negativeSet){
+				if (distProcessorNeg.ruleCoversCase(rule, line)){
+					countCoverNeg++;
+				}
+
+			}
+			output.add(new String[]{rule.toString(), new Integer(countCoverPos).toString(), new Integer(countCoverNeg).toString()});	
+
+		}
+		ProfileReaderWriter.writeReport(output, ruleFile+"Verif1.csv"); // output name: the rule file name with "Verif1.csv" appended
+	}
+	
+	
+	/*
+	 * Takes one argument for negative training set file, assumes the positive filename is formed by replacing 'negative'->'positive'
+	 * Outputs the filename with generated rules
+	 * 
+	 */
+	public String ruleFormer(String dataFile){
+		// load both training sets; the positive file name is derived from the negative one
+		List<String[]> negativeSet = ProfileReaderWriter.readProfiles(dataFile);
+		List<String[]> positiveSet = ProfileReaderWriter.readProfiles(dataFile.replace("negative", "positive"));
+		// the first row of each file serves as its header: set up the processors with it, then drop it
+		distProcessorPos = new FeatureSpaceCoverageProcessor();
+		distProcessorNeg = new FeatureSpaceCoverageProcessor();
+		distProcessorNeg.initParamMap(fieldsToAggr, negativeSet.get(0));
+		distProcessorPos.initParamMap(fieldsToAggr, positiveSet.get(0));
+		negativeSet.remove(0);
+		positiveSet.remove(0);
+
+		// pairwise intersections of negative cases, then intersections of those intersections
+		List<Map<String, String>> pairRules = formIntersectionAmongMembersOfTrainingSetAndVerifyThatDoesNotCoverOppositeTrainingS(negativeSet, positiveSet);
+		List<Map<String, String>> allRules = formIntersections(pairRules, negativeSet, positiveSet);
+
+		// one report row per rule: the rule itself and the number of positive cases it covers
+		List<String[]> report = new ArrayList<String[]>();
+		for(Map<String, String> rule: allRules){
+			int positivesCovered = 0;
+			for(String[] positiveCase: positiveSet)
+				if (distProcessorPos.ruleCoversCase(rule, positiveCase))
+					positivesCovered++;
+			report.add(new String[]{rule.toString(), Integer.toString(positivesCovered)});
+		}
+		String outputFile = "learnedRulesForNegativeSetJune23-1.csv";
+		ProfileReaderWriter.writeReport(report, outputFile);
+		return outputFile;
+	}
+
+	private List<Map<String, String>> formIntersections(List<Map<String, String>> intersectionsIn, List<String[]> negativeSet, List<String[]> positiveSet) { // intersects the given rules pairwise; negativeSet is not used here
+		List<Map<String, String>> intersectionsNew = new ArrayList<Map<String, String>>();
+		for(int i=0; i<intersectionsIn.size(); i++){
+			for(int j=i+1; j<intersectionsIn.size(); j++){
+				Map<String, String> intersection = distProcessorNeg.computeIntersection(intersectionsIn.get(i), intersectionsIn.get(j));
+				if (intersection.isEmpty())
+					continue;
+				// reject the candidate as soon as it covers any positive case
+				int countCover = 0;
+				for(String[] line: positiveSet){
+					if (distProcessorPos.ruleCoversCase(intersection, line)){
+						// (was countCover++) a single hit is enough: the large constant pushes the ratio over the threshold
+						countCover = 10000000;
+						break;
+					}
+				}
+				float cover = (float)countCover/(float)positiveSet.size(); // NOTE(review): 0f/0f is NaN for an empty positive set, which also takes the continue below
+				if (!(cover<this.percentageOfAllowedSetCover))
+					continue;
+				// compare the candidate with the input rules
+				List<Map<String, String>> rulesToBeRemoved = new ArrayList<Map<String, String>>();
+				boolean nothingCoversThisRule = true;
+				for(Map<String, String> intersChecker: intersectionsIn){ // more general rule covers more specific
+					if (distProcessorNeg.ruleCoversRule(intersChecker, intersection)){
+						nothingCoversThisRule = false;
+						break;
+					} // now check if this new rule defeats built rules
+					if (distProcessorNeg.ruleCoversRule( intersection, intersChecker)){
+						rulesToBeRemoved.add(intersChecker); 
+					}
+				}
+				if(nothingCoversThisRule){
+					intersectionsNew.add(intersection);
+					intersectionsNew.removeAll(rulesToBeRemoved); // NOTE(review): removes from intersectionsNew only; all input rules are re-added below
+				}
+			}
+		}
+		intersectionsNew.addAll(intersectionsIn); // result: accepted new intersections followed by all input rules
+		return intersectionsNew;
+	}
+
+	private List<Map<String, String>> formIntersectionAmongMembersOfTrainingSetAndVerifyThatDoesNotCoverOppositeTrainingS(List<String[]> negativeSet, List<String[]> positiveSet){
+		List<Map<String, String>> intersections = new ArrayList<Map<String, String>>();
+		// intersect negative cases pairwise; only the first 1000 cases serve as the left operand
+		for(int i=0; i<negativeSet.size() && i<1000; i++){
+			for(int j=i+1; j<negativeSet.size(); j++){
+				Map<String, String> intersection = distProcessorNeg.computeIntersection(negativeSet.get(i), negativeSet.get(j));
+				if (intersection.isEmpty())
+					continue;
+				
+				/* temporary code that formed rule covers at least 2 cases
+				int countCoverNeg=0;
+				for(String[] line: negativeSet){
+					if (distProcessorNeg.ruleCoversCase(intersection, line)){
+						countCoverNeg++;
+					}
+
+				} 
+				if (countCoverNeg<2){
+					System.err.println("A rule formed but it does not cover its origin! "+intersection);
+					distProcessorNeg.ruleCoversCase(intersection, negativeSet.get(i));
+					distProcessorNeg.ruleCoversCase(intersection, negativeSet.get(j));
+				} */
+				// reject the candidate as soon as it covers any positive case
+				
+				
+				int countCover = 0;
+				for(String[] line: positiveSet){
+					if (distProcessorPos.ruleCoversCase(intersection, line)){
+						// (was countCover++) a single hit is enough: the large constant pushes the ratio over the threshold
+						countCover = 10000000;
+						break;
+					}
+				}
+				float cover = (float)countCover/(float)positiveSet.size(); // NOTE(review): 0f/0f is NaN for an empty positive set, which also takes the continue below
+				if (!(cover<this.percentageOfAllowedSetCover))
+					continue;
+				// keep the candidate only if no rule collected so far covers it
+				List<Map<String, String>> rulesToBeRemoved = new ArrayList<Map<String, String>>();
+				boolean nothingCoversThisRule = true;
+				for(Map<String, String> intersChecker: intersections){ // more general rule covers more specific
+					if (distProcessorNeg.ruleCoversRule(intersChecker, intersection)){
+						nothingCoversThisRule = false;
+						break;
+					} // now check if this new rule defeats built rules
+					if (distProcessorNeg.ruleCoversRule( intersection, intersChecker)){
+						rulesToBeRemoved.add(intersChecker); 
+					}
+				}
+				if(nothingCoversThisRule){
+					intersections.add(intersection);
+					intersections.removeAll(rulesToBeRemoved); // drop the collected rules that the new rule covers
+				}
+			}
+		}
+		return intersections;
+	}
+
+	// Keeps the rules for which the fraction of positive cases NOT covered by the rule is below percentageOfAllowedSetCover.
+	// NOTE(review): the counter counts non-covered cases (note the negation in the ternary); confirm that this is the intended test.
+	private List<Map<String, String>> filterIntersectionsByOppositeTrainingSet(List<Map<String, String>> intersections, List<String[]> positiveSet){
+		List<Map<String, String>> kept = new ArrayList<Map<String, String>>();
+		for(Map<String, String> candidate: intersections){
+			int notCovered = 0;
+			for(String[] positiveCase: positiveSet)
+				notCovered += distProcessorPos.ruleCoversCase(candidate, positiveCase) ? 0 : 1;
+			float ratio = (float)notCovered/(float)positiveSet.size();
+			if (ratio<this.percentageOfAllowedSetCover)
+				kept.add(candidate);
+		}
+		return kept;
+	}
+
+    public boolean applyRule(String[] sample){
+    	return true;
+    	// TODO: implement a singleton which reads the rule file and applies the rules
+    	// until then this is a stub: it returns true without looking at the sample
+    }
+
+	// Demo entry point: learns rules from a hard-coded negative-set file, then verifies them.
+	public static void main(String[] args){
+		IntersectionSetBuilder iBuilder = new IntersectionSetBuilder ();
+		String negativeSetFile = "C:/workspace/relevanceEngine/src/test/resources/maps/anomaly/negativeSet1.csv";
+
+		// builds the set of rules; ruleFormer returns the name of the file it wrote
+		String resFile = iBuilder.ruleFormer(negativeSetFile);
+		// verifies and cleans exactly that rule file (previously a second hard-coded path was used)
+		iBuilder.ruleVerifier(negativeSetFile, resFile);
+	}
+
+}
+
+/*
+ * 
+ * datetime
+browser_language
+browser_string
+device_first_seen
+device_match_result
+http_os_signature
+http_os_sig_raw
+os
+device_id_reason_code
+true_ip
+proxy_ip
+http_os_sig_adv_mss
+http_os_sig_snd_mss
+http_os_sig_rcv_mss
+http_os_sig_ttl
+http_connection_type
+device_last_event
+flash_lang
+flash_os
+flash_version
+os_fonts_number
+plugin_adobe_acrobat
+plugin_flash
+plugin_silverlight
+plugin_windows_media_player
+profiling_datetime
+screen_res
+tcp_os_signature
+tcp_os_sig_raw
+time_zone
+time_zone_dst_offset
+profile_api_timedelta
+mime_type_number
+plugin_number
+plugin_quicktime
+plugin_java
+fuzzy_device_id_confidence
+fuzzy_device_match_result
+fuzzy_device_last_event
+fuzzy_device_first_seen
+true_ip_city
+true_ip_first_seen
+true_ip_geo
+true_ip_latitude
+true_ip_longitude
+account_email_first_seen
+shipping_address_first_seen
+tcp_os_sig_ttl
+tcp_connection_type
+page_time_on
+policy_score
+reason_code
+review_status
+risk_rating
+ */

Added: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/jsmlearning/ProfileReaderWriter.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/jsmlearning/ProfileReaderWriter.java?rev=1555944&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/jsmlearning/ProfileReaderWriter.java (added)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/jsmlearning/ProfileReaderWriter.java Mon Jan  6 17:48:30 2014
@@ -0,0 +1,140 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.jsmlearning;
+
+import java.io.FileNotFoundException;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.util.ArrayList;
+import java.util.List;
+
+import au.com.bytecode.opencsv.CSVReader;
+import au.com.bytecode.opencsv.CSVWriter;
+
+public class ProfileReaderWriter {
+	// Reads all rows of a comma-separated file; returns null (stack trace printed) on I/O failure.
+	public static List<String[]> readProfiles(String filename) {
+		CSVReader reader = null;
+		try	{
+			reader = new CSVReader(new FileReader(filename), ',');
+			return reader.readAll();
+		} catch (IOException e) { // FileNotFoundException is an IOException as well
+			e.printStackTrace();
+			return null;
+		} finally { // previously the reader was never closed
+			if (reader != null) try { reader.close(); } catch (IOException ignored) { /* nothing left to do */ }
+		}
+	}
+	
+	// Reads all rows of a file separated by the given delimiter; returns null (stack trace printed) on I/O failure.
+	public static List<String[]> readProfiles(String filename, char delimiter) {
+		CSVReader reader = null;
+		try	{
+			reader = new CSVReader(new FileReader(filename), delimiter);
+			return reader.readAll();
+		} catch (IOException e) { // FileNotFoundException is an IOException as well
+			e.printStackTrace();
+			return null;
+		} finally { // previously the reader was never closed
+			if (reader != null) try { reader.close(); } catch (IOException ignored) { /* nothing left to do */ }
+		}
+	}
+
+	public static void writeReportArr( String[][] allLines, String reportName){ // array variant of writeReport
+		// copy the rows into a list first, because writeReport takes a List<String[]>
+		List<String[]> rows = new ArrayList<String[]>(allLines.length);
+		for(int i=0; i<allLines.length; i++)
+			rows.add(allLines[i]);
+		writeReport( rows, reportName);
+	}
+
+	// Writes all rows to reportName as CSV, using the CSVWriter defaults.
+	// If the file cannot be opened, the stack trace is printed and nothing is written
+	// (previously this path dereferenced a null writer); the writer is closed in all cases.
+	public static void writeReport( List<String[]> allLines, String reportName){
+		CSVWriter writer = null;
+		try {
+			writer = new CSVWriter(new PrintWriter(reportName));
+			writer.writeAll(allLines);
+			writer.flush();
+		} catch (IOException e) { // includes the FileNotFoundException thrown by PrintWriter
+			e.printStackTrace();
+		} finally {
+			if (writer != null)
+				try { writer.close(); } catch (IOException e) { e.printStackTrace(); }
+		}
+	}
+
+	// Writes all rows to reportName, passing delimiter to CSVWriter as the separator.
+	// If the file cannot be opened, the stack trace is printed and nothing is written
+	// (previously this path dereferenced a null writer); the writer is closed in all cases.
+	// NOTE(review): delimiter is also passed as 3rd and 4th argument (presumably quote and escape character) - confirm intent.
+	public static void writeReport( List<String[]> allLines, String reportName, char delimiter){
+		CSVWriter writer = null;
+		try {
+			writer = new CSVWriter(new PrintWriter(reportName), delimiter, delimiter, delimiter);
+			writer.writeAll(allLines);
+			writer.flush();
+		} catch (IOException e) { // includes the FileNotFoundException thrown by PrintWriter
+			e.printStackTrace();
+		} finally {
+			if (writer != null)
+				try { writer.close(); } catch (IOException e) { e.printStackTrace(); }
+		}
+	}
+	
+	// Rewrites reportName so that it holds the given rows followed by the rows already in the file.
+	// The existing rows are now read with the same delimiter that is used for writing (the
+	// comma-only reader was used before), and a missing file is detected by an explicit
+	// null check instead of a caught NullPointerException. allLines is extended in place.
+	// NOTE(review): delimiter is also passed as 3rd and 4th CSVWriter argument (presumably quote and escape character) - confirm intent.
+	public static void appendReport( List<String[]> allLines, String reportName, char delimiter){
+		// readProfiles returns null when the file cannot be read, e.g. because it does not exist yet
+		List<String[]> previous = readProfiles(reportName, delimiter);
+		if (previous != null)
+			allLines.addAll(previous);
+		else
+			System.out.println("Creating file "+reportName);
+
+		CSVWriter writer = null;
+		try {
+			writer = new CSVWriter(new PrintWriter(reportName), delimiter, delimiter, delimiter);
+			writer.writeAll(allLines);
+			writer.flush();
+		} catch (IOException e) { // includes the FileNotFoundException thrown by PrintWriter
+			e.printStackTrace();
+		} finally {
+			if (writer != null)
+				try { writer.close(); } catch (IOException e) { e.printStackTrace(); }
+		}
+	}
+
+	public static void writeReportListStr(List<String> res, String string) {
+		// TODO: not implemented - auto-generated stub
+		// (currently a no-op: neither argument is used)
+	}
+
+	public static void main(String[] args){
+		List<String[]> allLines = new ArrayList<String[]>();
+		allLines.add(new String[] {"aa " , "  bb", "ccc" });
+		ProfileReaderWriter.writeReport( allLines, "reportName.txt", ' ');
+
+	}
+
+
+}
\ No newline at end of file

Added: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/jsmlearning/TreeKernelRunner.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/jsmlearning/TreeKernelRunner.java?rev=1555944&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/jsmlearning/TreeKernelRunner.java (added)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/jsmlearning/TreeKernelRunner.java Mon Jan  6 17:48:30 2014
@@ -0,0 +1,131 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.jsmlearning;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+
+/**
+ * Runs the external tree-kernel SVM executables ({@code svm_learn.exe} and
+ * {@code svm_classify.exe}) as child processes and echoes their output to standard output.
+ * <p>
+ * All paths are built with backslash separators, so the class is written for Windows.
+ */
+public class TreeKernelRunner {
+	/**
+	 * Starts {@code command} with {@code runPath} as working directory, echoes what the process
+	 * prints and blocks until the process terminates.
+	 * <p>
+	 * If the process cannot be started the error is printed and the method returns.
+	 *
+	 * @param command executable followed by its arguments
+	 * @param runPath working directory for the child process
+	 */
+	private void runEXE(String[] command, String runPath){
+		Process mStartProcess;
+		try {
+			mStartProcess = Runtime.getRuntime().exec( command, null, new File(runPath));
+		} catch (IOException e) {
+			// The process never started, so there is nothing to read from or wait for.
+			e.printStackTrace();
+			return;
+		}
+
+		// Drain both pipes: a child that fills an unread pipe blocks, and waitFor() would never return.
+		new StreamLogger(mStartProcess.getInputStream()).start();
+		new StreamLogger(mStartProcess.getErrorStream()).start();
+
+		try {
+			mStartProcess.waitFor();
+		} catch (InterruptedException e) {
+			e.printStackTrace();
+			// Restore the interrupt flag so the caller can still observe the interruption.
+			Thread.currentThread().interrupt();
+		}
+	}
+
+	/**
+	 * Converts forward slashes to backslashes and makes sure the path ends with a backslash.
+	 */
+	private static String normalizeDir(String dir){
+		String normalized = dir.replace('/', '\\');
+		if (!normalized.endsWith("\\"))
+			normalized += "\\";
+		return normalized;
+	}
+
+	/**
+	 * Runs {@code svm_learn.exe -t 5 learning_file model_file} from {@code dir}.
+	 *
+	 * @param dir           directory holding the executable and the data files
+	 * @param learning_file name of the training file inside {@code dir}
+	 * @param model_file    name of the model file inside {@code dir}
+	 */
+	public void runLearner(String dir, String learning_file, String  model_file)
+	{
+		dir = normalizeDir(dir);
+		String[] runString = new String[]{dir+"svm_learn.exe","-t", "5", dir+learning_file,  dir+model_file};
+		runEXE(runString, dir);
+	}
+
+	/**
+	 * Runs {@code svm_classify.exe example_file model_file predictions_file} from {@code dir}.
+	 *
+	 * @param dir              directory holding the executable and the data files
+	 * @param example_file     name of the file with examples to classify, inside {@code dir}
+	 * @param model_file       name of the model file inside {@code dir}
+	 * @param predictions_file name of the output file inside {@code dir}
+	 */
+	public void runClassifier(String dir, String example_file, String  model_file, String predictions_file)
+	{
+		dir = normalizeDir(dir);
+		String[] runString = new String[]{dir+"svm_classify.exe", dir+example_file,  dir+model_file, dir+predictions_file};
+		runEXE(runString, dir);
+	}
+
+	/**
+	 * Thread that copies an input stream to standard output line by line until end of stream.
+	 */
+	class StreamLogger extends Thread{
+
+		private InputStream mInputStream;
+
+		public StreamLogger(InputStream is) {
+			this.mInputStream = is;
+		}
+
+		@Override
+		public void run() {
+			// The stream is decoded with the platform default charset.
+			BufferedReader br = new BufferedReader(new InputStreamReader(mInputStream));
+			try {
+				String line;
+				while ((line = br.readLine()) != null) {
+					System.out.println(line);
+				}
+			} catch (IOException ioe) {
+				ioe.printStackTrace();
+			} finally {
+				try {
+					br.close();
+				} catch (IOException ignored) {
+					// Nothing useful can be done if closing the pipe fails.
+				}
+			}
+		}
+
+	}
+
+	/**
+	 * Example run: trains a model and then classifies a test file in a hard-coded directory.
+	 *
+	 * @param args ignored
+	 */
+	public static void main(String[] args){
+		TreeKernelRunner runner = new TreeKernelRunner();
+		runner.runLearner("C:\\stanford-corenlp\\tree_kernel\\", "training.txt", "arg0.model1.txt");
+		runner.runClassifier("C:\\stanford-corenlp\\tree_kernel\\", "arg0.test", "arg0.model1.txt", "arg0.output1.txt");
+	}
+}
+
+	/*
+exec:
+
+public Process exec(String command, String envp[], File dir) 
+
+
+
+   @param      command   a specified system command.
+   @param      envp      array of strings, each element of which 
+                         has environment variable settings in format
+                         <i>name</i>=<i>value</i>.
+   @param      dir       the working directory of the subprocess, or
+                         <tt>null</tt> if the subprocess should inherit
+                         the working directory of the current process.
+
+The distribution contains two exe files: svm_learn.exe and svm_classify.exe.
+
+1.   svm_learn.exe takes a file with examples, processes it and builds a model file with the learned rules.
+
+Example invocations:
+svm_learn -t 5 learning_file model_file - the simplest way to run it, SubSetTreeKernel (gaps are allowed when traversing the trees)
+
+svm_learn -t 5 -D 0 learning_file model_file - another kernel variant, SubTreeKernel
+
+An example file is available on the author's page, together with a description of the parameters.
+
+2. svm_classify.exe takes a file with test examples and the model file built by svm_learn, and writes the results to the file predictions_file.
+
+Invocation:     svm_classify example_file model_file predictions_file
+
+The file has the same format as the input examples. A sample is in the archive on Moschitti's page.
+The class an example belongs to can be given up front (1 or -1 at the start of the line). In that case precision and recall are evaluated automatically. Otherwise put 0 there.
+	 */
\ No newline at end of file

Modified: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/nl2code/NL2Obj.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/nl2code/NL2Obj.java?rev=1555944&r1=1555943&r2=1555944&view=diff
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/nl2code/NL2Obj.java (original)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/nl2code/NL2Obj.java Mon Jan  6 17:48:30 2014
@@ -16,6 +16,7 @@
  */
 package opennlp.tools.nl2code;
 
+import java.io.File;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
@@ -28,11 +29,19 @@ import opennlp.tools.textsimilarity.chun
 public class NL2Obj {
   ObjectControlOp prevOp;
 
-  public NL2Obj() {
+  public NL2Obj(String path) {
     prevOp = new ObjectControlOp();
     prevOp.setOperatorIf("");
     prevOp.setOperatorFor("");
+    parser = ParserChunker2MatcherProcessor.getInstance(path);
   }
+  
+  public NL2Obj() {
+	    prevOp = new ObjectControlOp();
+	    prevOp.setOperatorIf("");
+	    prevOp.setOperatorFor("");
+	    parser = ParserChunker2MatcherProcessor.getInstance();
+	  }
 
   public static String[] epistemicStatesList = new String[] {
     "select", "verify", "find", "start", "stop", "go", "check"
@@ -268,6 +277,9 @@ public class NL2Obj {
 
 
   public static void main(String[] args){
+	  
+	String cDir = new File(".").getAbsolutePath();
+	
     String[] text = new String[]{
         "Randomly select a pixel at an image.",
         "Find a convex area this pixel belongs, so that all pixels are less than 128",      //area->REGION

Modified: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/nl2code/NL2ObjCreateAssign.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/nl2code/NL2ObjCreateAssign.java?rev=1555944&r1=1555943&r2=1555944&view=diff
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/nl2code/NL2ObjCreateAssign.java (original)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/nl2code/NL2ObjCreateAssign.java Mon Jan  6 17:48:30 2014
@@ -25,120 +25,129 @@ import opennlp.tools.textsimilarity.chun
 
 public class NL2ObjCreateAssign extends NL2Obj {
 
-  private boolean classBeingDefined = false;
-  public static String[] declarationStatesList = new String[] {
-    "create", "assign", "set", 
-  };
-
-  public static String[] dataTypesList = new String[] {
-    "text", "double", "array", 
-  };
-
-  public static String[] arrayElementList = new String[] {
-    "first", "second", "third", "fourth" 
-  };
-
-  public static String[] arrayElementListInsdex = new String[] {
-    "0", "1", "2", "3" 
-  };
-
-
-  @Override
-  public ObjectPhraseListForSentence convertSentenceToControlObjectPhrase(String sentence){
-    String expression = null;
-    if (sentence.indexOf(":")>-1){
-      expression = sentence.split(":")[1];
-      sentence = sentence.split(":")[0]+".";
-    }
-
-
-    List<ObjectPhrase> oPhrases = new  ArrayList<ObjectPhrase>();
-    parser = ParserChunker2MatcherProcessor.getInstance();
-    List<List<ParseTreeChunk>> lingPhrases = 
-      parser.formGroupedPhrasesFromChunksForSentence(sentence);
-
-    ObjectControlOp op = extractControlPart(lingPhrases, prevOp);
-    prevOp = op;
-
-    //start with verb phrases
-    List<ParseTreeChunk> actionWithObject =  lingPhrases.get(1);
-    actionWithObject.addAll( lingPhrases.get(4));
-
-    System.out.println("      === "+actionWithObject);
-
-    for(ParseTreeChunk verbChunk: actionWithObject){
-      List<String> lems = verbChunk.getLemmas();
-      String declarativeAction = verbChunk.getLemmas().get(0).toLowerCase();
-      if (declarativeAction.equals("define")){
-        if (verbChunk.getLemmas().get(1).toLowerCase().equals("class") ||
-            verbChunk.getLemmas().get(2).toLowerCase().equals("class")){
-          // new class
-          String className = verbChunk.getLemmas().get(verbChunk.getLemmas().size()-1).toLowerCase();
-          className = className.substring(0, 1).toUpperCase()+className.substring(1, className.length());
-          op.setOperatorIf("class "+className + "{");
-          op.setOperatorFor("{");
-          classBeingDefined = true;
-          break;
-        }
-        String dataType = verbChunk.getLemmas().get(1).toLowerCase();
-
-        if (classBeingDefined && Arrays.asList(dataTypesList).contains(dataType) && verbChunk.getLemmas().get(2).toLowerCase().equals("attribute")){
-          op.setOperatorFor(dataType + " "+verbChunk.getLemmas().get(verbChunk.getLemmas().size()-1).toLowerCase());
-          classBeingDefined = true;
-          break;
-        }
-        if (Arrays.asList(dataTypesList).contains(dataType) && verbChunk.getLemmas().get(2).toLowerCase().equals("attribute")){
-          op.setOperatorFor(dataType + " "+verbChunk.getLemmas().get(verbChunk.getLemmas().size()-1).toLowerCase());
-          classBeingDefined = true;
-          break;
-        }
-      } else if (declarativeAction.equals("create")){
-
-        // now substituting array
-        if (verbChunk.getLemmas().get(1).toLowerCase().equals("array")){
-
-          if(lems.contains("class")){
-            int indClass = lems.indexOf("class");
-            int numElements = lems.indexOf("elements");
-            if (numElements<0)
-              numElements = lems.indexOf("objects");
-            if (numElements<0)
-              numElements = lems.indexOf("members");
-            String arraySize = lems.get(numElements-1);
-            op.setOperatorFor(lems.get(indClass+1)+"[] "+verbChunk.getLemmas().get(verbChunk.getLemmas().size()-1).toLowerCase() 
-                +" = new "+lems.get(indClass+1)+"["+arraySize+"]");
-            classBeingDefined = false;
-            break;
-          }
-        }    
-      } else if (declarativeAction.equals("assign")){
-        int numElements = lems.indexOf("element");
-        if (numElements<0)
-          numElements = lems.indexOf("object");
-        if (numElements<0)
-          numElements = lems.indexOf("member");
-        if (Arrays.asList(arrayElementList).contains(lems.get(numElements-1))){
-          int arrIndex = Arrays.asList(arrayElementList).indexOf(lems.get(numElements-1));
-          String indexValue = arrayElementListInsdex[arrIndex]; 
-
-          String arrayName = lems.get(lems.size()-1);
-          if (expression!=null)
-            op.setOperatorFor(arrayName+"["+indexValue+"]."+ expression);
-          break;
-        } 
-      } else if (declarativeAction.equals("set")){
-        int indQuantifier = lems.indexOf("all");
-        if (indQuantifier>-1 && 
-            (lems.get(indQuantifier+1).equals("elements") || lems.get(indQuantifier+1).equals("members") )){
-          
-          String arrayName = lems.get(lems.size()-1);
-          if (expression!=null)
-            op.setOperatorFor("for(int i=0; i<"+ arrayName+".size(); i++) "+
-                arrayName+"[i]."+ expression);
-          break;
-        } 
-      }
-      /*    
+	private boolean classBeingDefined = false;
+	public static String[] declarationStatesList = new String[] {
+		"create", "assign", "set", 
+	};
+
+	public static String[] dataTypesList = new String[] {
+		"text", "double", "array", 
+	};
+
+	public static String[] arrayElementList = new String[] {
+		"first", "second", "third", "fourth" 
+	};
+
+	public static String[] arrayElementListInsdex = new String[] {
+		"0", "1", "2", "3" 
+	};
+
+
+
+	public NL2ObjCreateAssign() {
+		super();
+	}
+
+	public NL2ObjCreateAssign(String path) {
+		super(path);
+	}
+
+	@Override
+	public ObjectPhraseListForSentence convertSentenceToControlObjectPhrase(String sentence){
+		String expression = null;
+		if (sentence.indexOf(":")>-1){
+			expression = sentence.split(":")[1];
+			sentence = sentence.split(":")[0]+".";
+		}
+
+
+		List<ObjectPhrase> oPhrases = new  ArrayList<ObjectPhrase>();
+		parser = ParserChunker2MatcherProcessor.getInstance();
+		List<List<ParseTreeChunk>> lingPhrases = 
+				parser.formGroupedPhrasesFromChunksForSentence(sentence);
+
+		ObjectControlOp op = extractControlPart(lingPhrases, prevOp);
+		prevOp = op;
+
+		//start with verb phrases
+		List<ParseTreeChunk> actionWithObject =  lingPhrases.get(1);
+		actionWithObject.addAll( lingPhrases.get(4));
+
+		System.out.println("      === "+actionWithObject);
+
+		for(ParseTreeChunk verbChunk: actionWithObject){
+			List<String> lems = verbChunk.getLemmas();
+			String declarativeAction = verbChunk.getLemmas().get(0).toLowerCase();
+			if (declarativeAction.equals("define")){
+				if (verbChunk.getLemmas().get(1).toLowerCase().equals("class") ||
+						verbChunk.getLemmas().get(2).toLowerCase().equals("class")){
+					// new class
+					String className = verbChunk.getLemmas().get(verbChunk.getLemmas().size()-1).toLowerCase();
+					className = className.substring(0, 1).toUpperCase()+className.substring(1, className.length());
+					op.setOperatorIf("class "+className + "{");
+					op.setOperatorFor("{");
+					classBeingDefined = true;
+					break;
+				}
+				String dataType = verbChunk.getLemmas().get(1).toLowerCase();
+
+				if (classBeingDefined && Arrays.asList(dataTypesList).contains(dataType) && verbChunk.getLemmas().get(2).toLowerCase().equals("attribute")){
+					op.setOperatorFor(dataType + " "+verbChunk.getLemmas().get(verbChunk.getLemmas().size()-1).toLowerCase());
+					classBeingDefined = true;
+					break;
+				}
+				if (Arrays.asList(dataTypesList).contains(dataType) && verbChunk.getLemmas().get(2).toLowerCase().equals("attribute")){
+					op.setOperatorFor(dataType + " "+verbChunk.getLemmas().get(verbChunk.getLemmas().size()-1).toLowerCase());
+					classBeingDefined = true;
+					break;
+				}
+			} else if (declarativeAction.equals("create")){
+
+				// now substituting array
+				if (verbChunk.getLemmas().get(1).toLowerCase().equals("array")){
+
+					if(lems.contains("class")){
+						int indClass = lems.indexOf("class");
+						int numElements = lems.indexOf("elements");
+						if (numElements<0)
+							numElements = lems.indexOf("objects");
+						if (numElements<0)
+							numElements = lems.indexOf("members");
+						String arraySize = lems.get(numElements-1);
+						op.setOperatorFor(lems.get(indClass+1)+"[] "+verbChunk.getLemmas().get(verbChunk.getLemmas().size()-1).toLowerCase() 
+								+" = new "+lems.get(indClass+1)+"["+arraySize+"]");
+						classBeingDefined = false;
+						break;
+					}
+				}    
+			} else if (declarativeAction.equals("assign")){
+				int numElements = lems.indexOf("element");
+				if (numElements<0)
+					numElements = lems.indexOf("object");
+				if (numElements<0)
+					numElements = lems.indexOf("member");
+				if (Arrays.asList(arrayElementList).contains(lems.get(numElements-1))){
+					int arrIndex = Arrays.asList(arrayElementList).indexOf(lems.get(numElements-1));
+					String indexValue = arrayElementListInsdex[arrIndex]; 
+
+					String arrayName = lems.get(lems.size()-1);
+					if (expression!=null)
+						op.setOperatorFor(arrayName+"["+indexValue+"]."+ expression);
+					break;
+				} 
+			} else if (declarativeAction.equals("set")){
+				int indQuantifier = lems.indexOf("all");
+				if (indQuantifier>-1 && 
+						(lems.get(indQuantifier+1).equals("elements") || lems.get(indQuantifier+1).equals("members") )){
+
+					String arrayName = lems.get(lems.size()-1);
+					if (expression!=null)
+						op.setOperatorFor("for(int i=0; i<"+ arrayName+".size(); i++) "+
+								arrayName+"[i]."+ expression);
+					break;
+				} 
+			}
+			/*    
         else {
           List<String> paramValues = verbChunk.getLemmas(), paramPOSs = verbChunk.getPOSs();
 
@@ -205,40 +214,40 @@ public class NL2ObjCreateAssign extends 
         oPhrases.add(oPhrase);      
 
       } */
-    }
+		}
 
-    ObjectPhraseListForSentence oplfs =  new ObjectPhraseListForSentence( oPhrases, op);
-    oplfs.cleanMethodNamesIsAre();
-    oplfs.substituteNullObjectIntoEmptyArg();
-      
-    return oplfs;
-  }
-
-  public static void main(String[] args){
-
-    String[] text = new String[]{
-        "Define a class and name it Employee. ",
-        "Define text attribute and name it m_name. ",
-        "Define double attribute and name it m_salary.",
-        "Create array of objects of class Employee for 10 elements, name the object as workforce.",
-        "Assign the first element in array workforce: m_name=\"Boss\"",
-        "Assign the second element in array workforce: m_name=\"His wife\"",
-       //  "Comment: We just started our small business company and expect to hire 8 more people soon.",
-        "Set for all elements in array workforce: m_salary=0 ",
-        "Print the list of all m_name attributes for workforce."
-
-    };
-
-    NL2Obj compiler = new NL2ObjCreateAssign();
-    for(String sent:text){
-      ObjectPhraseListForSentence opls=null;
-      try {
-        opls = compiler.convertSentenceToControlObjectPhrase(sent);
-      } catch (Exception e) {
-        e.printStackTrace();
-      }
-      System.out.println(sent+"\n"+opls+"\n");
-    }
+		ObjectPhraseListForSentence oplfs =  new ObjectPhraseListForSentence( oPhrases, op);
+		oplfs.cleanMethodNamesIsAre();
+		oplfs.substituteNullObjectIntoEmptyArg();
+
+		return oplfs;
+	}
+
+	public static void main(String[] args){
+
+		String[] text = new String[]{
+				"Define a class and name it Employee. ",
+				"Define text attribute and name it m_name. ",
+				"Define double attribute and name it m_salary.",
+				"Create array of objects of class Employee for 10 elements, name the object as workforce.",
+				"Assign the first element in array workforce: m_name=\"Boss\"",
+				"Assign the second element in array workforce: m_name=\"His wife\"",
+				//  "Comment: We just started our small business company and expect to hire 8 more people soon.",
+				"Set for all elements in array workforce: m_salary=0 ",
+				"Print the list of all m_name attributes for workforce."
+
+		};
+
+		NL2Obj compiler = new NL2ObjCreateAssign();
+		for(String sent:text){
+			ObjectPhraseListForSentence opls=null;
+			try {
+				opls = compiler.convertSentenceToControlObjectPhrase(sent);
+			} catch (Exception e) {
+				e.printStackTrace();
+			}
+			System.out.println(sent+"\n"+opls+"\n");
+		}
 
-  }
+	}
 }

Added: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/ArcType.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/ArcType.java?rev=1555944&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/ArcType.java (added)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/ArcType.java Mon Jan  6 17:48:30 2014
@@ -0,0 +1,54 @@
+package opennlp.tools.parse_thicket;
+
+public class ArcType{
+	private String type; // rst
+	private String subtype; // rst-explain
+	private Integer type_id;
+	private Integer subtype_id;
+	
+	public ArcType(String type, // rst
+	String subtype, // rst-explain
+	Integer type_id,
+	Integer subtype_id){
+		this.type = type; // rst
+		this.subtype = subtype; // rst-explain
+		this.type_id= type_id;
+		this.subtype_id = subtype_id;
+	}
+
+	public String getType() {
+		return type;
+	}
+
+	public void setType(String type) {
+		this.type = type;
+	}
+
+	public String getSubtype() {
+		return subtype;
+	}
+
+	public void setSubtype(String subtype) {
+		this.subtype = subtype;
+	}
+
+	public Integer getType_id() {
+		return type_id;
+	}
+
+	public void setType_id(Integer type_id) {
+		this.type_id = type_id;
+	}
+
+	public Integer getSubtype_id() {
+		return subtype_id;
+	}
+
+	public void setSubtype_id(Integer subtype_id) {
+		this.subtype_id = subtype_id;
+	}
+	
+	public String toString(){
+		return type+":"+subtype;
+	}
+}
\ No newline at end of file

Added: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/IGeneralizer.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/IGeneralizer.java?rev=1555944&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/IGeneralizer.java (added)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/IGeneralizer.java Mon Jan  6 17:48:30 2014
@@ -0,0 +1,12 @@
+package opennlp.tools.parse_thicket;
+
+import java.util.List;
+
+/**
+ * Contract for generalization of two objects.
+ * <p>
+ * All objects such as words, ParseTreeNodes, phrases, communicative actions etc. are subject to
+ * generalization, so they should implement this interface.
+ *
+ * @param <T> element type of the list produced by a generalization
+ */
+public interface IGeneralizer<T> {
+	/**
+	 * Generalizes two objects. In this project everything is subject to generalization, and the
+	 * result is returned as a list of generic objects.
+	 *
+	 * @param o1 first object to generalize
+	 * @param o2 second object to generalize
+	 * @return the list of generalization results
+	 */
+   public List<T> generalize(Object o1, Object o2);
+}

Added: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/PTTree.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/PTTree.java?rev=1555944&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/PTTree.java (added)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/PTTree.java Mon Jan  6 17:48:30 2014
@@ -0,0 +1,89 @@
+package opennlp.tools.parse_thicket;
+
+import java.io.PrintWriter;
+import java.util.ArrayList;
+import java.util.List;
+
+import edu.stanford.nlp.trees.LabeledScoredTreeNode;
+import edu.stanford.nlp.trees.SimpleTree;
+import edu.stanford.nlp.trees.Tree;
+import edu.stanford.nlp.trees.TreeFactory;
+
+
+
+/**
+ * {@code SimpleTree} subclass with a traversal skeleton modelled on Penn Treebank style
+ * tree printing. The printing statements are commented out, so the traversal currently only
+ * walks the tree and produces no output.
+ */
+public class PTTree extends SimpleTree {
+	
+	public PTTree(){
+		super();
+	}
+
+	/**
+	 * NOTE(review): the argument is not used; the new tree is created empty - confirm
+	 * whether {@code t} was meant to be copied.
+	 *
+	 * @param t ignored
+	 */
+	public PTTree(Tree t){
+		super();
+	}
+	private static final long serialVersionUID = 1L;
+
+	/**
+	 * Returns the children of this node typed as {@code PTTree}.
+	 *
+	 * @return the children held by the superclass, or an empty array if there are none
+	 * @throws ClassCastException if a child is not a {@code PTTree}
+	 */
+	@Override
+	public PTTree[] children() {
+		// Must go through super: calling children() on this object here would recurse forever.
+		Tree[] kids = super.children();
+		if (kids == null) {
+			return new PTTree[0];
+		}
+		// An array created as Tree[] cannot be cast to PTTree[], so the elements are copied one by one.
+		PTTree[] result = new PTTree[kids.length];
+		for (int i = 0; i < kids.length; i++) {
+			result[i] = (PTTree) kids[i];
+		}
+		return result;
+	}
+
+	/**
+	 * @return always {@code null}; no tree factory is provided yet
+	 */
+	@Override
+	public TreeFactory treeFactory() {
+		// TODO Auto-generated method stub
+		return null;
+	}
+	
+	/**
+	 * Walks the whole tree starting at this node. The {@code phrases} list handed to the
+	 * traversal is local to this call and is not returned.
+	 */
+	public void doNavigate(){
+		List<LabeledScoredTreeNode> phrases = new ArrayList<LabeledScoredTreeNode>();
+		navigate(0, false, false, false, true, true, phrases);
+	}
+	
+	/**
+	 * Visits each child in order, tracking whether it is the first sibling and whether its left
+	 * sibling was a pre-terminal (a left sibling whose value starts with "CC" counts as not
+	 * pre-terminal).
+	 */
+	private static void navigateChildren(PTTree[] trChildren, int indent, boolean parentLabelNull, boolean onlyLabelValue, List<LabeledScoredTreeNode> phrases) {
+	    boolean firstSibling = true;
+	    boolean leftSibIsPreTerm = true;  // counts as true at beginning
+	    for (PTTree currentTree : trChildren) {
+	      currentTree.navigate(indent, parentLabelNull, firstSibling, leftSibIsPreTerm, false, onlyLabelValue, phrases);
+	      leftSibIsPreTerm = currentTree.isPreTerminal();
+	      // CC is a special case for English, but leave it in so we can exactly match PTB3 tree formatting
+	      if (currentTree.value() != null && currentTree.value().startsWith("CC")) {
+	        leftSibIsPreTerm = false;
+	      }
+	      firstSibling = false;
+	    }
+	  }
+	
+	/**
+	   * Navigates the parse tree rooted at this node: stops at leaves and pre-terminals,
+	   * otherwise descends into the children with the indent increased by one.
+	   * The commented-out {@code pw} calls mark where printing would take place.
+	   */
+	  private void navigate(int indent, boolean parentLabelNull, boolean firstSibling, boolean leftSiblingPreTerminal, boolean topLevel, boolean onlyLabelValue, List<LabeledScoredTreeNode> phrases) {
+	    // the condition for staying on the same line in Penn Treebank
+	    boolean suppressIndent = (parentLabelNull || (firstSibling && isPreTerminal()) || (leftSiblingPreTerminal && isPreTerminal() && (label() == null || !label().value().startsWith("CC"))));
+	    if (suppressIndent) {
+	      //pw.print(" ");
+	      // pw.flush();
+	    } else {
+	      if (!topLevel) {
+	        //pw.println();
+	      }
+	      for (int i = 0; i < indent; i++) {
+	        //pw.print("  ");
+	        // pw.flush();
+	      }
+	    }
+	    if (isLeaf() || isPreTerminal()) {
+	      String terminalString = toStringBuilder(new StringBuilder(), onlyLabelValue).toString();
+	      //pw.print(terminalString);
+	      //pw.flush();
+	      return;
+	    }
+	    //pw.print("(");
+	    String nodeString = onlyLabelValue ? value() : nodeString();
+	    //pw.print(nodeString);
+	    // pw.flush();
+	    boolean parentIsNull = label() == null || label().value() == null;
+	    navigateChildren(children(), indent + 1, parentIsNull, true, phrases);
+	    //pw.print(")");
+	    
+	  }
+
+}

Added: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/Pair.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/Pair.java?rev=1555944&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/Pair.java (added)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/Pair.java Mon Jan  6 17:48:30 2014
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.parse_thicket;
+
+import java.util.Comparator;
+
+/**
+ * Generic pair class for holding two objects. Often used as return object.
+ * Both components are mutable and may be {@code null}.
+ * 
+ * @author Albert-Jan de Vries
+ * 
+ * @param <T1> type of the first component
+ * @param <T2> type of the second component
+ */
+public class Pair<T1, T2> {
+  private T1 first;
+
+  private T2 second;
+
+  /** Creates a pair whose components are both {@code null}. */
+  public Pair() {
+
+  }
+
+  /**
+   * @param first  the first component
+   * @param second the second component
+   */
+  public Pair(T1 first, T2 second) {
+    this.first = first;
+    this.second = second;
+  }
+
+  /** @return the first component */
+  public T1 getFirst() {
+    return first;
+  }
+
+  /** @param first the new first component */
+  public void setFirst(T1 first) {
+    this.first = first;
+  }
+
+  /** @return the second component */
+  public T2 getSecond() {
+    return second;
+  }
+
+  /** @param second the new second component */
+  public void setSecond(T2 second) {
+    this.second = second;
+  }
+  
+  /**
+   * Orders pairs by their second component in descending order when both second components
+   * are {@code Float} values.
+   */
+  public class PairComparable implements Comparator<Pair<T1, T2>> {
+    /**
+     * @return -1 if the second component of {@code o1} is greater, 0 if both are numerically
+     *         equal, 1 otherwise; -2 if either second component is not a {@code Float}
+     */
+    public int compare(Pair o1, Pair o2) {
+      if (!(o1.second instanceof Float) || !(o2.second instanceof Float)) {
+        // Sentinel kept from the original contract for pairs that cannot be compared.
+        return -2;
+      }
+      // Compare primitive values: == on boxed Floats would compare object identity.
+      float left = ((Float) o1.second).floatValue();
+      float right = ((Float) o2.second).floatValue();
+      if (left > right) {
+        return -1;
+      }
+      return (left == right) ? 0 : 1;
+    }
+  }
+
+  /**
+   * @return both components separated by a single space; a {@code null} component is
+   *         rendered as "null"
+   */
+  public String toString(){
+	  // String.valueOf tolerates null components, which the no-arg constructor allows.
+	  return String.valueOf(this.first)+" "+String.valueOf(this.second);
+  }
+  
+}
+

Added: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/ParseCorefsBuilder.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/ParseCorefsBuilder.java?rev=1555944&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/ParseCorefsBuilder.java (added)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/ParseCorefsBuilder.java Mon Jan  6 17:48:30 2014
@@ -0,0 +1,191 @@
+package opennlp.tools.parse_thicket;
+
+import java.io.*;
+import java.util.*;
+
+import opennlp.tools.parse_thicket.communicative_actions.CommunicativeActionsArcBuilder;
+
+import edu.stanford.nlp.dcoref.CorefChain;
+import edu.stanford.nlp.dcoref.CorefCoreAnnotations;
+import edu.stanford.nlp.dcoref.CorefChain.CorefMention;
+import edu.stanford.nlp.ling.*;
+import edu.stanford.nlp.ling.CoreAnnotations.NamedEntityTagAnnotation;
+import edu.stanford.nlp.ling.CoreAnnotations.PartOfSpeechAnnotation;
+import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
+import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
+import edu.stanford.nlp.pipeline.*;
+import edu.stanford.nlp.trees.*;
+import edu.stanford.nlp.util.*;
+
+/**
+ * Builds a {@link ParseThicket} from a paragraph of text by running a Stanford
+ * CoreNLP pipeline (tokenize, ssplit, pos, lemma, ner, parse, dcoref) and
+ * turning the resulting coreference chains into inter-sentence arcs.
+ */
+public class ParseCorefsBuilder {
+	protected static ParseCorefsBuilder instance;
+	StanfordCoreNLP pipeline;
+	CommunicativeActionsArcBuilder caFinder = new CommunicativeActionsArcBuilder();
+
+	/**
+	 * singleton method of instantiating the processor
+	 * 
+	 * @return the instance
+	 */
+	public synchronized static ParseCorefsBuilder getInstance() {
+		if (instance == null)
+			instance = new ParseCorefsBuilder();
+
+		return instance;
+	}
+
+	/** Creates the builder and its CoreNLP pipeline. */
+	ParseCorefsBuilder(){
+		Properties props = new Properties();
+		props.put("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");
+		pipeline = new StanfordCoreNLP(props);
+	}
+
+	/**
+	 * Annotates the text and collects, per sentence, its parse tree and its list
+	 * of token nodes, plus one arc for every pair of mentions that belong to the
+	 * same coreference chain.
+	 * <p>
+	 * If annotation fails the exception is printed and whatever was collected up
+	 * to that point is returned (possibly an empty thicket).
+	 *
+	 * @param text the paragraph to process
+	 * @return the thicket holding the parse trees, the coreference arcs and the
+	 *         per-sentence node lists
+	 */
+	public ParseThicket buildParseThicket(String text){
+		List<Tree> ptTrees = new ArrayList<Tree>();
+		List<List<ParseTreeNode>> nodesThicket = new ArrayList<List<ParseTreeNode>>();
+
+		// Kept local: this object is a shared singleton, so per-call state must
+		// not live in an instance field.
+		Annotation annotation = new Annotation(text);
+		try {
+			pipeline.annotate(annotation);
+			List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);
+			if (sentences != null) {
+				for (CoreMap sentence : sentences) {
+					nodesThicket.add(buildSentenceNodes(sentence));
+					ptTrees.add(sentence.get(TreeCoreAnnotations.TreeAnnotation.class));
+				}
+			}
+		} catch (Exception e) {
+			// best effort: report and continue with what has been collected
+			e.printStackTrace();
+		}
+
+		List<WordWordInterSentenceRelationArc> arcs = buildCorefArcs(annotation);
+
+		// NOTE(review): the communicative-action arcs are computed but are not
+		// added to the thicket, exactly as before. Confirm whether they should
+		// be merged into 'arcs' (they use 0-based sentence indices).
+		List<WordWordInterSentenceRelationArc> arcsCA = buildCAarcs(nodesThicket);
+
+		ParseThicket result = new ParseThicket(ptTrees, arcs);
+		result.setNodesThicket(nodesThicket);
+		return result;
+	}
+
+	/** Converts the tokens of one sentence into nodes numbered from 1. */
+	private List<ParseTreeNode> buildSentenceNodes(CoreMap sentence) {
+		List<ParseTreeNode> nodes = new ArrayList<ParseTreeNode>();
+		// a CoreLabel is a CoreMap with additional token-specific methods
+		List<CoreLabel> coreLabelList = sentence.get(TokensAnnotation.class);
+		int count = 1;
+		for (CoreLabel token : coreLabelList) {
+			// this is the text of the token
+			String word = token.get(TextAnnotation.class);
+			// this is the POS tag of the token
+			String pos = token.get(PartOfSpeechAnnotation.class);
+			// this is the NER label of the token
+			String ne = token.get(NamedEntityTagAnnotation.class);
+			nodes.add(new ParseTreeNode(word, pos, ne, count));
+			count++;
+		}
+		return nodes;
+	}
+
+	/**
+	 * Creates one arc for every ordered pair (i &lt; j) of mentions within each
+	 * coreference chain. Returns an empty list when the annotation carries no
+	 * coreference chains, e.g. because the pipeline failed.
+	 */
+	private List<WordWordInterSentenceRelationArc> buildCorefArcs(Annotation annotation) {
+		List<WordWordInterSentenceRelationArc> arcs = new ArrayList<WordWordInterSentenceRelationArc>();
+		Map<Integer, CorefChain> corefs = annotation.get(CorefCoreAnnotations.CorefChainAnnotation.class);
+		if (corefs == null) {
+			// previously this case ended in a NullPointerException
+			return arcs;
+		}
+		for (CorefChain chain : corefs.values()) {
+			List<CorefMention> mentions = chain.getMentionsInTextualOrder();
+			for (int i = 0; i < mentions.size(); i++) {
+				CorefMention mi = mentions.get(i);
+				for (int j = i + 1; j < mentions.size(); j++) {
+					CorefMention mj = mentions.get(j);
+
+					// sentence and word positions as reported by the mentions
+					// (original author's note: all numbering from 1, not 0)
+					int niSentence = mi.position.get(0);
+					int niWord = mi.startIndex;
+					int njSentence = mj.position.get(0);
+					int njWord = mj.startIndex;
+
+					ArcType arcType = new ArcType("coref-", mj.mentionType+"-"+mj.animacy, 0, 0);
+
+					arcs.add(new WordWordInterSentenceRelationArc(
+							new Pair<Integer, Integer>(niSentence, niWord),
+							new Pair<Integer, Integer>(njSentence, njWord),
+							mi.mentionSpan, mj.mentionSpan, arcType));
+				}
+			}
+		}
+		return arcs;
+	}
+
+	/**
+	 * Creates an arc of type "ca" for every pair of sentences in which the
+	 * communicative-action finder returns a non-null result for both sentences.
+	 * Sentence indices in these arcs are 0-based list positions.
+	 */
+	private List<WordWordInterSentenceRelationArc> buildCAarcs(
+			List<List<ParseTreeNode>> nodesThicket) {
+		List<WordWordInterSentenceRelationArc> arcs = new ArrayList<WordWordInterSentenceRelationArc>();
+
+		for(int sentI=0; sentI<nodesThicket.size(); sentI++){
+			for(int sentJ=sentI+1; sentJ<nodesThicket.size(); sentJ++){
+				List<ParseTreeNode> sentenceI = nodesThicket.get(sentI), 
+						sentenceJ = nodesThicket.get(sentJ);
+				Pair<String, Integer[]> caI = caFinder.findCAInSentence(sentenceI);
+				Pair<String, Integer[]> caJ = caFinder.findCAInSentence(sentenceJ);
+				if (caI==null || caJ==null)
+					continue;
+				// looked up only once both sentences are known to contain one
+				int indexCA1 = caFinder.findCAIndexInSentence(sentenceI);
+				int indexCA2 = caFinder.findCAIndexInSentence(sentenceJ);
+				Pair<String, Integer[]> caGen = caFinder.generalize(caI, caJ).get(0);
+
+				ArcType arcType = new ArcType("ca", 
+						caGen.getFirst().toString()+printNumArray(caGen.getSecond()), 0, 0);
+				arcs.add(new WordWordInterSentenceRelationArc(
+						new Pair<Integer, Integer>(sentI,indexCA1), 
+						new Pair<Integer, Integer>(sentJ,indexCA2),
+						caI.getFirst(), caJ.getFirst(), arcType));
+			}
+		}
+
+		return arcs;
+	}
+
+	/** Renders the array as its elements, each followed by a single space. */
+	private String printNumArray(Integer[] arr){
+		StringBuilder buf = new StringBuilder();
+		for(Integer i: arr){
+			buf.append(i).append(" ");
+		}
+		return buf.toString();
+	}
+
+	/** Demo entry point: builds a thicket for a fixed four-sentence paragraph. */
+	public static void main(String[] args) throws IOException {
+		ParseCorefsBuilder builder = ParseCorefsBuilder.getInstance();
+		ParseThicket  th = builder.buildParseThicket("Iran refuses to accept the UN proposal to end its dispute over its work on nuclear weapons."+
+				"UN nuclear watchdog passes a resolution condemning Iran for developing its second uranium enrichment site in secret. " +
+				"A recent IAEA report presented diagrams that suggested Iran was secretly working on nuclear weapons. " +
+				"Iran envoy says its nuclear development is for peaceful purpose, and the material evidence against it has been fabricated by the US. ");
+		//GraphFromPTreeBuilder gbuilder = new GraphFromPTreeBuilder();
+		//gbuilder.buildGraphFromPT(th);
+
+	}
+
+}
+
+/*
+ * [<sent=1-word=1..Iran> ===> <sent=3-word=9..Iran>, <sent=1-word=1..Iran> ===> <sent=4-word=1..Iran>, <sent=1-word=1..Iran> ===> <sent=4-word=4..its>, <sent=1-word=1..Iran> ===> <sent=4-word=17..it>, <sent=3-word=9..Iran> ===> <sent=4-word=1..Iran>, <sent=3-word=9..Iran> ===> <sent=4-word=4..its>, <sent=3-word=9..Iran> ===> <sent=4-word=17..it>, <sent=4-word=1..Iran> ===> <sent=4-word=4..its>, <sent=4-word=1..Iran> ===> <sent=4-word=17..it>, <sent=4-word=4..its> ===> <sent=4-word=17..it>, <sent=1-word=6..UN> ===> <sent=2-word=1..UN>, <sent=1-word=5..the UN proposal> ===> <sent=1-word=10..its>, <sent=1-word=5..the UN proposal> ===> <sent=1-word=13..its>, <sent=1-word=10..its> ===> <sent=1-word=13..its>, <sent=1-word=16..nuclear weapons> ===> <sent=3-word=14..nuclear weapons>, <sent=2-word=1..UN nuclear watchdog> ===> <sent=2-word=11..its>]
+
+[[[1]Iran:NNP>LOCATION, [2]refuses:VBZ>O, [3]to:TO>O, [4]accept:VB>O, [5]the:DT>O, [6]UN:NNP>ORGANIZATION, [7]proposal:NN>O, [8]to:TO>O, [9]end:VB>O, [10]its:PRP$>O, [11]dispute:NN>O, [12]over:IN>O, [13]its:PRP$>O, [14]work:NN>O, [15]on:IN>O, [16]nuclear:JJ>O, [17]weapons:NNS>O, [18].:.>O], 
+
+[[1]UN:NNP>ORGANIZATION, [2]nuclear:JJ>O, [3]watchdog:NN>O, [4]passes:VBZ>O, [5]a:DT>O, [6]resolution:NN>O, [7]condemning:VBG>O, [8]Iran:NNP>LOCATION, [9]for:IN>O, [10]developing:VBG>O, [11]its:PRP$>O, [12]second:JJ>ORDINAL, [13]uranium:NN>O, [14]enrichment:NN>O, [15]site:NN>O, [16]in:IN>O, [17]secret:NN>O, [18].:.>O], 
+
+[[1]A:DT>O, [2]recent:JJ>O, [3]IAEA:NNP>ORGANIZATION, [4]report:NN>O, [5]presented:VBD>O, [6]diagrams:NNS>O, [7]that:WDT>O, [8]suggested:VBD>O, [9]Iran:NNP>LOCATION, [10]was:VBD>O, [11]secretly:RB>O, [12]working:VBG>O, [13]on:IN>O, [14]nuclear:JJ>O, [15]weapons:NNS>O, [16].:.>O], 
+
+[[1]Iran:NNP>LOCATION, [2]envoy:NN>O, [3]says:VBZ>O, [4]its:PRP$>O, [5]nuclear:JJ>O, [6]development:NN>O, [7]is:VBZ>O, [8]for:IN>O, [9]peaceful:JJ>O, [10]purpose:NN>O, [11],:,>O, [12]and:CC>O, [13]the:DT>O, [14]material:NN>O, [15]evidence:NN>O, [16]against:IN>O, [17]it:PRP>O, [18]has:VBZ>O, [19]been:VBN>O, [20]fabricated:VBN>O, [21]by:IN>O, [22]the:DT>O, [23]US:NNP>LOCATION, [24].:.>O]]
+*/

Added: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/ParseThicket.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/ParseThicket.java?rev=1555944&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/ParseThicket.java (added)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/ParseThicket.java Mon Jan  6 17:48:30 2014
@@ -0,0 +1,59 @@
+package opennlp.tools.parse_thicket;
+
+import java.util.List;
+
+import edu.stanford.nlp.trees.Tree;
+
+/**
+ * Result of parsing a paragraph: one parse tree per sentence, the arcs that
+ * relate words across (or within) sentences, and the token nodes of every
+ * sentence.
+ */
+public class ParseThicket {
+	// parse trees, one per sentence
+	private List<Tree> sentenceTrees;
+	// arcs relating a word of one sentence to a word of another (or the same) sentence
+	private List<WordWordInterSentenceRelationArc> arcs;
+	// lists of nodes for each sentence
+	// then list for all sentences
+	private List<List<ParseTreeNode>> sentenceNodes;
+	
+	/** @return the parse trees of the sentences */
+	public List<Tree> getSentences() {
+		return sentenceTrees;
+	}
+
+	/** @param sentences the parse trees of the sentences */
+	public void setSentences(List<Tree> sentences) {
+		this.sentenceTrees = sentences;
+	}
+
+	/** @return the word-to-word relation arcs */
+	public List<WordWordInterSentenceRelationArc> getArcs() {
+		return arcs;
+	}
+
+	/** @param arcs the word-to-word relation arcs */
+	public void setArcs(List<WordWordInterSentenceRelationArc> arcs) {
+		this.arcs = arcs;
+	}
+
+	/** @return for each sentence, the list of its token nodes */
+	public List<List<ParseTreeNode>> getNodesThicket() {
+		return sentenceNodes;
+	}
+
+	/** @param nodesThicket for each sentence, the list of its token nodes */
+	public void setNodesThicket(List<List<ParseTreeNode>> nodesThicket) {
+		this.sentenceNodes = nodesThicket;
+	}
+
+	/**
+	 * Parses the paragraph with the shared {@link ParseCorefsBuilder} and takes
+	 * over the trees, arcs and node lists of the thicket it produces.
+	 *
+	 * @param paragraph the text to parse
+	 */
+	public ParseThicket(String paragraph){
+		ParseCorefsBuilder builder = ParseCorefsBuilder.getInstance();
+		ParseThicket res = builder.buildParseThicket(paragraph);
+		this.sentenceTrees= res.sentenceTrees;
+		this.arcs = res.arcs;
+		// the node lists were previously dropped, leaving getNodesThicket() null
+		this.sentenceNodes = res.sentenceNodes;
+	}
+
+	/**
+	 * Creates a thicket from already computed trees and arcs; the node lists
+	 * stay unset until {@link #setNodesThicket(List)} is called.
+	 *
+	 * @param ptTrees the parse trees of the sentences
+	 * @param barcs the word-to-word relation arcs
+	 */
+	public ParseThicket(List<Tree> ptTrees,
+			List<WordWordInterSentenceRelationArc> barcs) {
+		this.sentenceTrees= ptTrees;
+		this.arcs = barcs;				
+	}
+	
+	/** @return the parse trees, a line break, then the arcs */
+	public String toString(){
+		return this.sentenceTrees+"\n"+this.arcs;
+	}
+	
+}

Added: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/ParseTreeNode.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/ParseTreeNode.java?rev=1555944&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/ParseTreeNode.java (added)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/ParseTreeNode.java Mon Jan  6 17:48:30 2014
@@ -0,0 +1,153 @@
+package opennlp.tools.parse_thicket;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * A token of a parsed sentence (word, POS tag, NER label, id and an optional
+ * phrase type) that can be generalized against another token.
+ */
+public class ParseTreeNode implements IGeneralizer<ParseTreeNode>{
+	// this is the text of the token
+	String word;
+	// this is the POS tag of the token
+	String pos; 
+	// this is the NER label of the token
+	String ne; 
+	// identifier of the node; generalize() assigns -1 to the nodes it creates
+	Integer id;
+	// phrase type label, unset unless setPhraseType() is called
+	String phraseType;
+
+	/** Phrase type constants, each carrying its textual label. */
+	public enum PhraseType {NP("NP"), VP("VP"), PRP("PRP");
+		private final String text;
+
+		private PhraseType(final String text) {
+			this.text = text;
+		}
+	}
+
+	/**
+	 * @param word the text of the token
+	 * @param pos the POS tag of the token
+	 * @param ne the NER label of the token
+	 * @param id the identifier of the node
+	 */
+	public ParseTreeNode(String word, String pos, String ne, Integer id) {
+		super();
+		this.word = word;
+		this.pos = pos;
+		this.ne = ne;
+		this.id = id;
+	}
+
+	/**
+	 * Creates a node with only word and POS tag; NER label and id stay
+	 * {@code null}.
+	 *
+	 * @param word the text of the token
+	 * @param pos the POS tag of the token
+	 */
+	public ParseTreeNode(String word, String pos) {
+		super();
+		this.word = word;
+		this.pos = pos;
+		// ne and id are intentionally left unset (the former self-assignments
+		// "this.ne = ne; this.id = id;" were no-ops)
+	}
+
+	public String getPhraseType() {
+		return phraseType;
+	}
+	public void setPhraseType(String pt) {
+		this.phraseType=pt;
+	}
+	public String getWord() {
+		return word;
+	}
+	public void setWord(String word) {
+		this.word = word;
+	}
+	public String getPos() {
+		return pos;
+	}
+	public void setPos(String pos) {
+		this.pos = pos;
+	}
+	public String getNe() {
+		return ne;
+	}
+	public void setNe(String ne) {
+		this.ne = ne;
+	}
+	public Integer getId() {
+		return id;
+	}
+	public void setId(Integer id) {
+		this.id = id;
+	} 
+
+	/**
+	 * @return the non-null parts of the node concatenated as
+	 *         {@code <id>phraseType'word':pos}
+	 */
+	public String toString(){
+		StringBuilder buf = new StringBuilder();
+		if (id!=null)
+			buf.append("<"+id+">");
+		if(phraseType!=null)
+			buf.append(phraseType);
+		if(word!=null)
+			buf.append("'"+word+"'");
+		if (pos!=null)
+			buf.append(":"+pos);
+		return buf.toString();
+	}
+
+	/**
+	 * Generalizes two nodes into at most one node.
+	 *
+	 * @param o1 the first node; must be a {@code ParseTreeNode}
+	 * @param o2 the second node; must be a {@code ParseTreeNode}
+	 * @return an empty list when the POS tags cannot be generalized, otherwise
+	 *         a single node with the generalized word and POS tag, NER label
+	 *         "O" and id -1
+	 */
+	@Override
+	public List<ParseTreeNode> generalize(Object o1, Object o2) {
+		List<ParseTreeNode> result = new ArrayList<ParseTreeNode>();
+
+		ParseTreeNode w1 = (ParseTreeNode) o1;
+		ParseTreeNode w2 = (ParseTreeNode) o2;
+		String posGen =  generalizePOS(w1.pos, w2.pos);
+		if (posGen ==null)
+			return result;
+		ParseTreeNode newNode = new ParseTreeNode(generalizeWord(w1.word, w2.word),
+				posGen, "O", -1);
+		result.add(newNode);
+		return result;
+	}
+
+	/**
+	 * @param lemma1 the first word
+	 * @param lemma2 the second word
+	 * @return the word itself when both are equal, otherwise the wildcard "*"
+	 */
+	public String generalizeWord(String lemma1, String lemma2){
+		if (lemma1.equals(lemma2))
+			return lemma1;
+		//TODO anything smarter than a wildcard for differing words
+		return "*";
+	}
+
+	/**
+	 * Generalizes two POS (or phrase) tags. The result does not depend on the
+	 * order of the arguments, except that for tags differing only in letter
+	 * case the spelling of {@code pos1} is returned.
+	 *
+	 * @param pos1 the first tag
+	 * @param pos2 the second tag
+	 * @return "NN" for a noun tag paired with NP, VBG or ADJP; "IN" for IN
+	 *         paired with TO; "VB" for two verb tags; the tag itself when both
+	 *         are equal ignoring case; the common two-letter prefix followed by
+	 *         "*" when only the prefixes agree; {@code null} otherwise
+	 */
+	public String generalizePOS(String pos1, String pos2) {
+		// The VBG rule used to test pos2.startsWith("VBG") && pos1.equals("NN"),
+		// so ("VBG", "NN") yielded null while ("NN", "VBG") yielded "NN".
+		if (isNounWith(pos1, pos2, "NP") || isNounWith(pos1, pos2, "VBG")
+				|| isNounWith(pos1, pos2, "ADJP")) {
+			return "NN";
+		}
+		if ((pos1.equals("IN") && pos2.equals("TO") || pos1.equals("TO")
+				&& pos2.equals("IN"))) {
+			return "IN";
+		}
+		// VBx vs VBx = VB (does not matter which form for verb)
+		if (pos1.startsWith("VB") && pos2.startsWith("VB")) {
+			return "VB";
+		}
+
+		if (pos1.equalsIgnoreCase(pos2)) {
+			return pos1;
+		}
+		// ABx vs ABy always gives AB*
+		if (pos1.length() > 2) {
+			pos1 = pos1.substring(0, 2);
+		}
+
+		if (pos2.length() > 2) {
+			pos2 = pos2.substring(0, 2);
+		}
+		if (pos1.equalsIgnoreCase(pos2)) {
+			return pos1 + "*";
+		}
+		return null;
+	}
+
+	/** True when one tag starts with "NN" and the other equals {@code other}. */
+	private static boolean isNounWith(String pos1, String pos2, String other) {
+		return (pos1.startsWith("NN") && pos2.equals(other))
+				|| (pos2.startsWith("NN") && pos1.equals(other));
+	}
+
+}
+

Added: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/Triple.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/Triple.java?rev=1555944&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/Triple.java (added)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/Triple.java Mon Jan  6 17:48:30 2014
@@ -0,0 +1,49 @@
+package opennlp.tools.parse_thicket;
+
+import java.util.Comparator;
+
+
+/**
+ * Generic mutable container for three objects.
+ *
+ * @param <T1> type of the first element
+ * @param <T2> type of the second element
+ * @param <T3> type of the third element
+ */
+public class Triple<T1, T2, T3> {
+	private T1 first;
+	private T2 second;
+	private T3 third;
+
+	/** Creates a triple whose three elements are all {@code null}. */
+	public Triple() {
+	}
+
+	/**
+	 * Creates a triple from the three given elements.
+	 *
+	 * @param first the first element
+	 * @param second the second element
+	 * @param third the third element
+	 */
+	public Triple(T1 first, T2 second, T3 third) {
+		this.first = first;
+		this.second = second;
+		this.third = third;
+	}
+
+	/** @return the first element */
+	public T1 getFirst() {
+		return this.first;
+	}
+
+	/** @return the second element */
+	public T2 getSecond() {
+		return this.second;
+	}
+
+	/** @return the third element */
+	public T3 getThird() {
+		return this.third;
+	}
+
+	/** @param first the new first element */
+	public void setFirst(T1 first) {
+		this.first = first;
+	}
+
+	/** @param second the new second element */
+	public void setSecond(T2 second) {
+		this.second = second;
+	}
+
+	/** @param third the new third element */
+	public void setThird(T3 third) {
+		this.third = third;
+	}
+}
\ No newline at end of file

Added: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/WordWordInterSentenceRelationArc.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/WordWordInterSentenceRelationArc.java?rev=1555944&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/WordWordInterSentenceRelationArc.java (added)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/WordWordInterSentenceRelationArc.java Mon Jan  6 17:48:30 2014
@@ -0,0 +1,68 @@
+package opennlp.tools.parse_thicket;
+
+/**
+ * An arc from one word to another. Each end is identified by a pair of
+ * integers (printed as sentence and word by {@link #toString()}) and a text
+ * label; the arc also carries an {@link ArcType}.
+ */
+public class WordWordInterSentenceRelationArc {
+
+		// (sentence, word) code and text of the end the arc starts from
+		Pair<Integer, Integer> codeFrom;
+		// (sentence, word) code and text of the end the arc points to
+		Pair<Integer, Integer> codeTo;
+		String lemmaFrom;
+		String lemmaTo;
+		ArcType arcType;
+
+		/**
+		 * @param codeFrom (sentence, word) code of the start of the arc
+		 * @param codeTo (sentence, word) code of the end of the arc
+		 * @param lemmaFrom text at the start of the arc
+		 * @param lemmaTo text at the end of the arc
+		 * @param arcType type of the arc
+		 */
+		public WordWordInterSentenceRelationArc(
+				Pair<Integer, Integer> codeFrom, Pair<Integer, Integer> codeTo,
+				String lemmaFrom, String lemmaTo, ArcType arcType) {
+			this.codeFrom = codeFrom;
+			this.lemmaFrom = lemmaFrom;
+			this.codeTo = codeTo;
+			this.lemmaTo = lemmaTo;
+			this.arcType = arcType;
+		}
+
+		/** @return the (sentence, word) code of the start of the arc */
+		public Pair<Integer, Integer> getCodeFrom() {
+			return this.codeFrom;
+		}
+
+		/** @return the (sentence, word) code of the end of the arc */
+		public Pair<Integer, Integer> getCodeTo() {
+			return this.codeTo;
+		}
+
+		/** @return the text at the start of the arc */
+		public String getLemmaFrom() {
+			return this.lemmaFrom;
+		}
+
+		/** @return the text at the end of the arc */
+		public String getLemmaTo() {
+			return this.lemmaTo;
+		}
+
+		/** @return the type of the arc */
+		public ArcType getArcType() {
+			return this.arcType;
+		}
+
+		public void setCodeFrom(Pair<Integer, Integer> codeFrom) {
+			this.codeFrom = codeFrom;
+		}
+
+		public void setCodeTo(Pair<Integer, Integer> codeTo) {
+			this.codeTo = codeTo;
+		}
+
+		public void setLemmaFrom(String lemmaFrom) {
+			this.lemmaFrom = lemmaFrom;
+		}
+
+		public void setLemmaTo(String lemmaTo) {
+			this.lemmaTo = lemmaTo;
+		}
+
+		public void setArcType(ArcType arcType) {
+			this.arcType = arcType;
+		}
+
+		/**
+		 * @return both ends rendered as {@code <sent=S-word=W..text>}, joined by
+		 *         " ===> "
+		 */
+		public String toString(){
+			StringBuilder out = new StringBuilder();
+			out.append(describeEnd(codeFrom, lemmaFrom));
+			out.append(" ===> ");
+			out.append(describeEnd(codeTo, lemmaTo));
+			return out.toString();
+		}
+
+		/** Renders one end of the arc as {@code <sent=S-word=W..text>}. */
+		private static String describeEnd(Pair<Integer, Integer> code, String lemma) {
+			return "<sent=" + code.getFirst() + "-word=" + code.getSecond() + ".." + lemma + ">";
+		}
+
+}

Added: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/apps/BingQueryRunnerMultipageSearchResults.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/apps/BingQueryRunnerMultipageSearchResults.java?rev=1555944&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/apps/BingQueryRunnerMultipageSearchResults.java (added)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/apps/BingQueryRunnerMultipageSearchResults.java Mon Jan  6 17:48:30 2014
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.parse_thicket.apps;
+
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.net.URL;
+import java.net.URLConnection;
+import java.net.URLEncoder;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.logging.Logger;
+
+import net.billylieurance.azuresearch.AzureSearchResultSet;
+import net.billylieurance.azuresearch.AzureSearchWebQuery;
+import net.billylieurance.azuresearch.AzureSearchWebResult;
+
+import opennlp.tools.similarity.apps.BingQueryRunner;
+import opennlp.tools.similarity.apps.HitBase;
+
+import org.apache.commons.lang.StringUtils;
+import org.json.JSONArray;
+import org.json.JSONObject;
+
+
+/**
+ * Runs a web search through the Azure search client and converts the results
+ * into {@link HitBase} objects, optionally asking for a later result page.
+ */
+public class BingQueryRunnerMultipageSearchResults extends BingQueryRunner {
+	
+	// SECURITY NOTE(review): this API key is hard-coded in source control;
+	// it should be supplied through configuration and the committed key revoked.
+	private static final String BING_KEY = "e8ADxIjn9YyHx36EihdjH/tMqJJItUrrbPTUpKahiU0=";
+	private static final Logger LOG = Logger
+		      .getLogger("opennlp.tools.similarity.apps.BingQueryRunnerMultipageSearchResults");
+
+	/**
+	 * Executes the query and returns its hits.
+	 *
+	 * @param query the search query
+	 * @param nRes the number of results requested per page
+	 * @param bHighRank when {@code false}, page 5 of the results is requested
+	 *        instead of the query object's default page
+	 * @return one hit (abstract, title and URL) per search result; empty when
+	 *         the query produced no result set
+	 */
+	public List<HitBase> runSearch(String query, int nRes, boolean bHighRank) {
+		// A fresh query object per call: paging set for one search must not
+		// leak into the next one.
+		AzureSearchWebQuery aq = new AzureSearchWebQuery();
+		aq.setAppid(BING_KEY);
+		aq.setQuery(query);
+		// Paging has to be configured before the query is executed; it was
+		// previously set after doQuery() and so never applied to this search.
+		if (!bHighRank)
+			aq.setPage(5);
+		aq.setPerPage(nRes);
+		aq.doQuery();
+		
+		List<HitBase> results = new ArrayList<HitBase> ();
+		AzureSearchResultSet<AzureSearchWebResult> ars = aq.getQueryResult();
+		if (ars == null) {
+			// presumably the case when the request failed; verify against the client
+			return results;
+		}
+		
+		for (AzureSearchWebResult anr : ars){
+		    HitBase h = new HitBase();
+		    h.setAbstractText(anr.getDescription());
+		    h.setTitle(anr.getTitle());
+		    h.setUrl(anr.getUrl());
+		    results.add(h);
+		}
+		return results;
+	}
+
+}