You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by bg...@apache.org on 2014/01/06 18:48:32 UTC
svn commit: r1555944 [6/11] - in /opennlp/sandbox/opennlp-similarity/src: main/java/opennlp/tools/apps/ main/java/opennlp/tools/apps/contentgen/ main/java/opennlp/tools/apps/contentgen/multithreaded/ main/java/opennlp/tools/apps/relevanceVocabs/ main/j...

Added: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/ParseTreePathMatcherDeterministic.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/ParseTreePathMatcherDeterministic.java?rev=1555944&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/ParseTreePathMatcherDeterministic.java (added)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/ParseTreePathMatcherDeterministic.java Mon Jan  6 17:48:30 2014
@@ -0,0 +1,280 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.parse_thicket.matching;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import opennlp.tools.stemmer.PorterStemmer;
+import opennlp.tools.textsimilarity.POSManager;
+
+
+public class ParseTreePathMatcherDeterministic {
+
+  private GeneralizationListReducer generalizationListReducer = new GeneralizationListReducer();
+
+  private LemmaFormManager lemmaFormManager = new LemmaFormManager();
+
+  private POSManager posManager = new POSManager();
+
+  /**
+   * Key matching function: takes two phrases, aligns them on the words whose
+   * stems occur in both, and returns the common sub-phrases found along each
+   * alignment.
+   * <p>
+   * Lemmas are lower-cased and stemmed; stems present in both phrases act as
+   * anchors. Consecutive anchors whose positions grow in both phrases form one
+   * alignment. Each alignment is scanned position by position, starting two
+   * words before its first anchor, collecting the POS tag of the first phrase
+   * and either the matched lemma or {@code "*"} when the lemma matcher
+   * returns no lemma.
+   * 
+   * @param chunk1
+   *          first phrase
+   * @param chunk2
+   *          second phrase
+   * @return one generalization per alignment, or {@code null} when the two
+   *         phrases have no stem in common
+   */
+  public List<ParseTreePath> generalizeTwoGroupedPhrasesDeterministic(
+      ParseTreePath chunk1, ParseTreePath chunk2) {
+    List<String> pos1 = chunk1.getPOSs();
+    List<String> pos2 = chunk2.getPOSs();
+    List<String> lem1 = chunk1.getLemmas();
+    List<String> lem2 = chunk2.getLemmas();
+
+    PorterStemmer ps = new PorterStemmer();
+    // the stem lists must stay index-aligned with lem1/pos1 and lem2/pos2,
+    // because positions found in them are used below to index those lists
+    List<String> lem1stem = stemLemmas(ps, lem1);
+    List<String> lem2stem = stemLemmas(ps, lem2);
+
+    // stems of the first phrase that also occur in the second one, in
+    // first-phrase order; a null stem marks an unstemmable word and is never
+    // used as an anchor
+    List<String> overlap = new ArrayList<String>();
+    for (String stem : lem1stem) {
+      if (stem != null && lem2stem.contains(stem)) {
+        overlap.add(stem);
+      }
+    }
+    if (overlap.isEmpty())
+      return null;
+
+    List<Integer> occur1 = new ArrayList<Integer>(), occur2 = new ArrayList<Integer>();
+    for (String word : overlap) {
+      occur1.add(lem1stem.indexOf(word));
+      occur2.add(lem2stem.indexOf(word));
+    }
+
+    // now we search for plausible sublists of overlaps
+    // if at some position correspondence is inverse (one of two positions
+    // decreases instead of increases)
+    // then we terminate current alignment accum and start a new one
+    List<List<int[]>> overlapsPlaus = new ArrayList<List<int[]>>();
+    List<int[]> accum = new ArrayList<int[]>();
+    accum.add(new int[] { occur1.get(0), occur2.get(0) });
+    // starts from 1, not 0: the first anchor is already in accum
+    for (int i = 1; i < occur1.size(); i++) {
+      boolean bothIncrease = occur1.get(i) > occur1.get(i - 1)
+          && occur2.get(i) > occur2.get(i - 1);
+      if (!bothIncrease) {
+        overlapsPlaus.add(accum);
+        accum = new ArrayList<int[]>();
+      }
+      accum.add(new int[] { occur1.get(i), occur2.get(i) });
+    }
+    // accum always holds at least one anchor at this point
+    overlapsPlaus.add(accum);
+
+    List<ParseTreePath> results = new ArrayList<ParseTreePath>();
+    for (List<int[]> occur : overlapsPlaus) {
+      List<Integer> occr1 = new ArrayList<Integer>(), occr2 = new ArrayList<Integer>();
+      for (int[] column : occur) {
+        occr1.add(column[0]);
+        occr2.add(column[1]);
+      }
+
+      int ov1 = 0, ov2 = 0; // iterators over common words
+      List<String> commonPOS = new ArrayList<String>(), commonLemmas = new ArrayList<String>();
+      // we start two words before first word
+      int k1 = occr1.get(ov1) - 2, k2 = occr2.get(ov2) - 2;
+      boolean bReachedCommonWord = false;
+      // shift both cursors together until neither is before the phrase start
+      while (k1 < 0 || k2 < 0) {
+        k1++;
+        k2++;
+      }
+      int k1max = pos1.size() - 1, k2max = pos2.size() - 1;
+      while (k1 <= k1max && k2 <= k2max) {
+        // first check if the same POS
+        String sim = posManager.similarPOS(pos1.get(k1), pos2.get(k2));
+        String lemmaMatch = lemmaFormManager.matchLemmas(ps, lem1.get(k1),
+            lem2.get(k2), sim);
+        if (sim != null && !"fail".equals(lemmaMatch)) {
+          commonPOS.add(pos1.get(k1));
+          if (lemmaMatch != null) {
+            commonLemmas.add(lemmaMatch);
+            if (k1 == occr1.get(ov1) && k2 == occr2.get(ov2)) {
+              // now we can have different increment
+              bReachedCommonWord = true;
+            } else if (occr1.size() > ov1 + 1 && occr2.size() > ov2 + 1
+                && k1 == occr1.get(ov1 + 1) && k2 == occr2.get(ov2 + 1)) {
+              // the cursors landed on the next anchor pair
+              ov1++;
+              ov2++;
+              bReachedCommonWord = true;
+            }
+          } else {
+            commonLemmas.add("*");
+          }
+          // the same parts of speech, proceed to the next word in both
+          // expressions
+          k1++;
+          k2++;
+        } else if (!bReachedCommonWord) {
+          // still searching
+          k1++;
+          k2++;
+        } else {
+          // different parts of speech, jump to the next identified common word
+          ov1++;
+          ov2++;
+          if (ov1 > occr1.size() - 1 || ov2 > occr2.size() - 1)
+            break;
+          // new positions of iterators: two words before the next anchor
+          int kk1 = occr1.get(ov1) - 2, kk2 = occr2.get(ov2) - 2;
+          int countMove = 0;
+          // if it is behind current position, synchronously move towards
+          // right (at most two steps)
+          while ((kk1 < k1 + 1 || kk2 < k2 + 1) && countMove < 2) {
+            kk1++;
+            kk2++;
+            countMove++;
+          }
+          k1 = kk1;
+          k2 = kk2;
+
+          if (k1 > k1max)
+            k1 = k1max;
+          if (k2 > k2max)
+            k2 = k2max;
+          bReachedCommonWord = false;
+        }
+      }
+      results.add(new ParseTreePath(commonLemmas, commonPOS, 0, 0));
+    }
+
+    return results;
+  }
+
+  /**
+   * Lower-cases and stems every lemma, keeping the result index-aligned with
+   * the input: a lemma that cannot be stemmed yields a {@code null} entry
+   * instead of being dropped.
+   */
+  private static List<String> stemLemmas(PorterStemmer ps, List<String> lemmas) {
+    List<String> stems = new ArrayList<String>(lemmas.size());
+    for (String word : lemmas) {
+      String stem = null;
+      try {
+        stem = ps.stem(word.toLowerCase()).toString();
+      } catch (Exception e) {
+        // best effort: very short tokens are not worth reporting
+        if (word != null && word.length() > 2)
+          System.err.println("Unable to stem: " + word);
+      }
+      stems.add(stem);
+    }
+    return stems;
+  }
+
+  /**
+   * Main function to generalize two expressions grouped by phrase type.
+   * Every phrase of a group in the first expression is generalized against
+   * every phrase of the same group in the second one; the collected
+   * generalizations of each group are then filtered by subsumption.
+   * 
+   * @param sent1
+   *          phrases of the first expression, one list per phrase type
+   * @param sent2
+   *          phrases of the second expression, one list per phrase type
+   * @return for each processed phrase type, the filtered list of
+   *         generalizations of its phrases
+   */
+  public List<List<ParseTreePath>> matchTwoSentencesGroupedChunksDeterministic(
+      List<List<ParseTreePath>> sent1, List<List<ParseTreePath>> sent2) {
+    List<List<ParseTreePath>> results = new ArrayList<List<ParseTreePath>>();
+    // at most the first two groups are compared (just np & vp)
+    int groupCount = Math.min(2, Math.min(sent1.size(), sent2.size()));
+    for (int comp = 0; comp < groupCount; comp++) {
+      List<ParseTreePath> resultComps = new ArrayList<ParseTreePath>();
+      for (ParseTreePath ch1 : sent1.get(comp)) {
+        for (ParseTreePath ch2 : sent2.get(comp)) { // simpler version
+          List<ParseTreePath> chunkToAdd = generalizeTwoGroupedPhrasesDeterministic(
+              ch1, ch2);
+          // nothing in common between these two phrases
+          if (chunkToAdd == null || chunkToAdd.isEmpty()) {
+            continue;
+          }
+
+          // the whole batch is skipped as soon as one already collected
+          // generalization occurs in it
+          boolean alreadyThere = false;
+          for (ParseTreePath chunk : resultComps) {
+            if (chunkToAdd.contains(chunk)) {
+              alreadyThere = true;
+              break;
+            }
+          }
+          if (!alreadyThere) {
+            resultComps.addAll(chunkToAdd);
+          }
+        }
+      }
+      results.add(generalizationListReducer
+          .applyFilteringBySubsumption(resultComps));
+    }
+
+    return results;
+  }
+
+}

Added: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/parse_thicket2graph/EdgeProductBuilder.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/parse_thicket2graph/EdgeProductBuilder.java?rev=1555944&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/parse_thicket2graph/EdgeProductBuilder.java (added)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/parse_thicket2graph/EdgeProductBuilder.java Mon Jan  6 17:48:30 2014
@@ -0,0 +1,121 @@
+package opennlp.tools.parse_thicket.parse_thicket2graph;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.List;
+import java.util.Set;
+
+import opennlp.tools.parse_thicket.ParseCorefsBuilder;
+import opennlp.tools.parse_thicket.ParseThicket;
+import opennlp.tools.parse_thicket.ParseTreeNode;
+import opennlp.tools.parse_thicket.matching.Matcher;
+import opennlp.tools.textsimilarity.ParseTreeChunk;
+
+import org.jgrapht.Graph;
+import org.jgrapht.alg.BronKerboschCliqueFinder;
+import org.jgrapht.graph.DefaultEdge;
+import org.jgrapht.graph.SimpleGraph;
+
+
+public class EdgeProductBuilder {
+	private Matcher matcher = new Matcher();
+	private ParseCorefsBuilder ptBuilder = ParseCorefsBuilder.getInstance();
+	private GraphFromPTreeBuilder graphBuilder = new GraphFromPTreeBuilder();
+	
+	
+	/**
+	 * Builds the edge product of two parse graphs.
+	 * <p>
+	 * A product vertex is created for every pair of edges (one per graph) whose
+	 * source nodes and whose target nodes both generalize to a non-empty result;
+	 * the vertex is the array {source1, target1, source2, target2}. Every pair of
+	 * product vertices accepted by bothAjacentOrNeitherAdjacent is then joined
+	 * by an edge.
+	 *
+	 * @param g1 first parse graph
+	 * @param g2 second parse graph
+	 * @return the product graph
+	 */
+	public Graph<ParseGraphNode[], DefaultEdge>  
+		buildEdgeProduct(Graph<ParseGraphNode, DefaultEdge> g1, Graph<ParseGraphNode, DefaultEdge> g2 ){
+		Graph<ParseGraphNode[], DefaultEdge> product =
+				new SimpleGraph<ParseGraphNode[], DefaultEdge>(DefaultEdge.class);
+
+		Set<DefaultEdge> firstEdges = g1.edgeSet();
+		Set<DefaultEdge> secondEdges = g2.edgeSet();
+		// build nodes of product graph
+		for (DefaultEdge edge1 : firstEdges) {
+			ParseGraphNode from1 = g1.getEdgeSource(edge1);
+			ParseGraphNode to1 = g1.getEdgeTarget(edge1);
+			for (DefaultEdge edge2 : secondEdges) {
+				ParseGraphNode from2 = g2.getEdgeSource(edge2);
+				ParseGraphNode to2 = g2.getEdgeTarget(edge2);
+
+				// targets are only generalized when the sources already matched
+				boolean sourcesMatch =
+						isNotEmpty(matcher.generalize(from1.getPtNodes(), from2.getPtNodes()));
+				if (sourcesMatch
+						&& isNotEmpty(matcher.generalize(to1.getPtNodes(), to2.getPtNodes()))) {
+					product.addVertex(new ParseGraphNode[] { from1, to1, from2, to2 });
+				}
+			}
+		}
+
+		// connect every unordered pair of product vertices that passes the check
+		List<ParseGraphNode[]> vertices = new ArrayList<ParseGraphNode[]>(product.vertexSet());
+		int total = vertices.size();
+		for (int i = 0; i < total; i++) {
+			ParseGraphNode[] first = vertices.get(i);
+			for (int j = i + 1; j < total; j++) {
+				ParseGraphNode[] second = vertices.get(j);
+				if (bothAjacentOrNeitherAdjacent(first, second)) {
+					product.addEdge(first, second);
+				}
+			}
+		}
+
+		return product;
+	}
+	/**
+	 * Returns the biggest maximal cliques of the given product graph, as
+	 * computed by BronKerboschCliqueFinder.
+	 * Finding the maximal clique is the slowest part.
+	 */
+	public Collection<Set<ParseGraphNode[]>> getMaximalCommonSubgraphs(Graph<ParseGraphNode[], DefaultEdge>  g){
+		return new BronKerboschCliqueFinder<ParseGraphNode[], DefaultEdge>(g)
+				.getBiggestMaximalCliques();
+	}
+
+
+	/**
+	 * Counts how many entries of the first product vertex also occur in the
+	 * second one and accepts the pair when that count is exactly 2 or 4.
+	 */
+	private boolean bothAjacentOrNeitherAdjacent(ParseGraphNode[] prodVertexI,
+			ParseGraphNode[] prodVertexJ) {
+		List<ParseGraphNode> nodesOfJ = Arrays.asList(prodVertexJ);
+		int shared = 0;
+		for (ParseGraphNode node : prodVertexI) {
+			if (nodesOfJ.contains(node)) {
+				shared++;
+			}
+		}
+		return shared == 2 || shared == 4;
+	}
+
+
+	/**
+	 * Tells whether a generalization result carries at least one chunk in its
+	 * first group.
+	 *
+	 * @param generalize result to inspect; may be null or empty
+	 * @return true only when the first group exists and is not empty
+	 */
+	private boolean isNotEmpty(List<List<ParseTreeChunk>> generalize) {
+		// an empty outer list has no first group: get(0) would throw
+		// IndexOutOfBoundsException, so it is rejected up front
+		if (generalize == null || generalize.isEmpty()) {
+			return false;
+		}
+		List<ParseTreeChunk> firstGroup = generalize.get(0);
+		return firstGroup != null && !firstGroup.isEmpty();
+	}
+	
+	/**
+	 * Compares two texts: a parse thicket and then a graph is built for each,
+	 * the edge product of the two graphs is computed, and its biggest maximal
+	 * cliques are returned.
+	 *
+	 * @param para1 first text
+	 * @param para2 second text
+	 * @return the cliques found in the product graph
+	 */
+	public Collection<Set<ParseGraphNode[]>>  assessRelevanceViaMaximalCommonSubgraphs(String para1, String para2) {
+		// first build PTs for each text
+		ParseThicket firstThicket = ptBuilder.buildParseThicket(para1);
+		ParseThicket secondThicket = ptBuilder.buildParseThicket(para2);
+		// then build the graphs
+		Graph<ParseGraphNode, DefaultEdge> firstGraph = graphBuilder.buildGraphFromPT(firstThicket);
+		Graph<ParseGraphNode, DefaultEdge> secondGraph = graphBuilder.buildGraphFromPT(secondThicket);
+
+		return getMaximalCommonSubgraphs(buildEdgeProduct(firstGraph, secondGraph));
+	}
+	
+	/**
+	 * Demo entry point: compares two short multi-sentence paragraphs and prints
+	 * the resulting collection of cliques.
+	 *
+	 * @param args ignored
+	 */
+	public static void main(String[] args){
+		// every sentence ends with ". " so that neighbouring sentences are not
+		// glued together (the first one used to yield "weapons.UN")
+		String para1 = "Iran refuses to accept the UN proposal to end its dispute over its work on nuclear weapons. " +
+				"UN nuclear watchdog passes a resolution condemning Iran for developing its second uranium enrichment site in secret. " +
+				"A recent IAEA report presented diagrams that suggested Iran was secretly working on nuclear weapons. " +
+				"Iran envoy says its nuclear development is for peaceful purpose, and the material evidence against it has been fabricated by the US. ";
+		String para2 = "Iran refuses the UN offer to end a conflict over its nuclear weapons. " +
+				"UN passes a resolution prohibiting Iran from developing its uranium enrichment site. " +
+				"A recent UN report presented charts saying Iran was working on nuclear weapons. " +
+				"Iran envoy to UN states its nuclear development is for peaceful purpose, and the evidence against its claim is fabricated by the US. ";
+
+		EdgeProductBuilder b = new EdgeProductBuilder();
+		Collection<Set<ParseGraphNode[]>> col = b.assessRelevanceViaMaximalCommonSubgraphs(para1, para2);
+		System.out.print(col);
+	}
+}
+				

Added: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/parse_thicket2graph/GraphFromPTreeBuilder.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/parse_thicket2graph/GraphFromPTreeBuilder.java?rev=1555944&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/parse_thicket2graph/GraphFromPTreeBuilder.java (added)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/parse_thicket2graph/GraphFromPTreeBuilder.java Mon Jan  6 17:48:30 2014
@@ -0,0 +1,131 @@
+package opennlp.tools.parse_thicket.parse_thicket2graph;
+
+import java.io.PrintWriter;
+import java.util.List;
+
+import opennlp.tools.parse_thicket.PTTree;
+import opennlp.tools.parse_thicket.ParseThicket;
+import opennlp.tools.parse_thicket.ParseTreeNode;
+import org.jgrapht.Graph;
+import org.jgrapht.graph.DefaultDirectedWeightedGraph;
+import org.jgrapht.graph.DefaultEdge;
+import org.jgrapht.graph.SimpleGraph;
+
+
+import edu.stanford.nlp.trees.LabeledScoredTreeNode;
+import edu.stanford.nlp.trees.Tree;
+
+public class GraphFromPTreeBuilder {
+	
+	
+	/**
+	 * Builds a graph from the first sentence tree of the parse thicket. The
+	 * tree is also written with pennPrint to a PrintWriter wrapping System.out.
+	 * <p>
+	 * NOTE(review): only the sentence at index 0 is used; the other sentences
+	 * of the thicket are ignored here - confirm this is intended.
+	 *
+	 * @param pt parse thicket to convert
+	 * @return graph built from the first sentence
+	 */
+	public Graph<ParseGraphNode, DefaultEdge> buildGraphFromPT(ParseThicket pt){
+		PrintWriter console = new PrintWriter(System.out);
+		Tree firstSentence = pt.getSentences().get(0);
+		firstSentence.pennPrint(console);
+
+		//ParseTreeVisualizer applet = new ParseTreeVisualizer();
+		//applet.showGraph(gfragment);
+
+		return buildGGraphFromTree(firstSentence);
+	}
+	
+	
+	/**
+	 * Creates an empty graph, adds a root vertex labelled "S 0" for the given
+	 * tree and lets navigate add the rest of the tree below that root.
+	 */
+	private Graph<ParseGraphNode, DefaultEdge> buildGGraphFromTree(Tree tree) {
+		Graph<ParseGraphNode, DefaultEdge> graph =
+				new SimpleGraph<ParseGraphNode, DefaultEdge>(DefaultEdge.class);
+		ParseGraphNode rootNode = new ParseGraphNode(tree, "S 0");
+		graph.addVertex(rootNode);
+		navigate(tree, graph, 0, rootNode);
+		return graph;
+	}
+
+
+
+	/**
+	 * Recursively walks the tree and adds, for every non-leaf child, a vertex
+	 * plus an edge from the current parent vertex to it.
+	 *
+	 * @param tree subtree whose children are processed
+	 * @param g graph being filled
+	 * @param l depth counter; appended to vertex labels as " #l"
+	 * @param currParent vertex that new child vertices are attached to
+	 */
+	private void navigate(Tree tree, Graph<ParseGraphNode, DefaultEdge> g, int l, ParseGraphNode currParent) {
+		//String currParent = tree.label().value()+" $"+Integer.toString(l);
+		//g.addVertex(currParent);
+		// NOTE(review): for a single child this recurses with the same parent and
+		// then still falls through to the loop below, which visits that child
+		// again - confirm the double visit is intended
+		if (tree.getChildrenAsList().size()==1)
+			navigate(tree.getChildrenAsList().get(0), g, l+1, currParent);
+		else
+			if (tree.getChildrenAsList().size()==0)
+				return;
+		
+		for(Tree child: tree.getChildrenAsList()){
+			String currChild = null;
+			ParseGraphNode currChildNode = null;
+			try {
+				// leaves get no vertex and are not descended into
+				if (child.isLeaf()) 
+					continue;
+				// children whose label starts with "S": their first child is
+				// additionally walked under the current parent
+				if (child.label().value().startsWith("S"))
+					navigate(child.getChildrenAsList().get(0), g, l+1, currParent);
+				
+				// label: full subtree text for non-phrasal or pre-terminal
+				// nodes, otherwise just the node label; both get the depth suffix
+				if (!child.isPhrasal() || child.isPreTerminal())
+					currChild = child.toString()+" #"+Integer.toString(l);
+				else 
+					currChild = child.label().value()+" #"+Integer.toString(l);
+				currChildNode = new ParseGraphNode(child, currChild);
+				g.addVertex(currChildNode);
+				g.addEdge(currParent, currChildNode);
+			} catch (Exception e) {
+				// TODO Auto-generated catch block
+				e.printStackTrace();
+			}
+			// NOTE(review): if the try block failed before currChildNode was
+			// assigned, the recursion below runs with a null parent
+			navigate(child, g, l+1, currChildNode);
+		}
+	}
+
+
+	/*
+	private static void navigateChildren(PTTree[] trChildren, int indent, boolean parentLabelNull, boolean onlyLabelValue, List<LabeledScoredTreeNode> phrases) {
+	    boolean firstSibling = true;
+	    boolean leftSibIsPreTerm = true;  // counts as true at beginning
+	    for (PTTree currentTree : trChildren) {
+	      currentTree.navigate(indent, parentLabelNull, firstSibling, leftSibIsPreTerm, false, onlyLabelValue, phrases);
+	      leftSibIsPreTerm = currentTree.isPreTerminal();
+	      // CC is a special case for English, but leave it in so we can exactly match PTB3 tree formatting
+	      if (currentTree.value() != null && currentTree.value().startsWith("CC")) {
+	        leftSibIsPreTerm = false;
+	      }
+	      firstSibling = false;
+	    }
+	  }
+	
+	
+	  private void navigate(int indent, boolean parentLabelNull, boolean firstSibling, boolean leftSiblingPreTerminal, boolean topLevel, boolean onlyLabelValue, List<LabeledScoredTreeNode> phrases) {
+	    // the condition for staying on the same line in Penn Treebank
+	    boolean suppressIndent = (parentLabelNull || (firstSibling && isPreTerminal()) || (leftSiblingPreTerminal && isPreTerminal() && (label() == null || !label().value().startsWith("CC"))));
+	    if (suppressIndent) {
+	      //pw.print(" ");
+	      // pw.flush();
+	    } else {
+	      if (!topLevel) {
+	        //pw.println();
+	      }
+	      for (int i = 0; i < indent; i++) {
+	        //pw.print("  ");
+	        // pw.flush();
+	      }
+	    }
+	    if (isLeaf() || isPreTerminal()) {
+	      String terminalString = toStringBuilder(new StringBuilder(), onlyLabelValue).toString();
+	      //pw.print(terminalString);
+	      //pw.flush();
+	      return;
+	    }
+	    //pw.print("(");
+	    String nodeString = onlyLabelValue ? value() : nodeString();
+	    //pw.print(nodeString);
+	    // pw.flush();
+	    boolean parentIsNull = label() == null || label().value() == null;
+	    navigateChildren(children(), indent + 1, parentIsNull, true, phrases);
+	    //pw.print(")");
+	    
+	  }
+	  */
+
+}

Added: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/parse_thicket2graph/ParseGraphNode.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/parse_thicket2graph/ParseGraphNode.java?rev=1555944&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/parse_thicket2graph/ParseGraphNode.java (added)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/parse_thicket2graph/ParseGraphNode.java Mon Jan  6 17:48:30 2014
@@ -0,0 +1,51 @@
+package opennlp.tools.parse_thicket.parse_thicket2graph;
+
+import java.util.List;
+
+import opennlp.tools.parse_thicket.ParseTreeNode;
+import opennlp.tools.parse_thicket.matching.PT2ThicketPhraseBuilder;
+
+
+import edu.stanford.nlp.trees.Tree;
+
+/**
+ * Graph vertex wrapping a parse tree together with a display label and the
+ * phrases built from that tree at construction time.
+ */
+public class ParseGraphNode {
+	PT2ThicketPhraseBuilder phraseBuilder = new PT2ThicketPhraseBuilder();
+
+	private Tree tree;
+	private String label;
+	private List<List<ParseTreeNode>> ptNodes;
+
+	/**
+	 * Stores the tree and label and builds the phrases for the tree.
+	 *
+	 * @param tree parse tree represented by this vertex
+	 * @param label text returned by toString
+	 */
+	public ParseGraphNode(Tree tree, String label) {
+		this.tree = tree;
+		this.label = label;
+		this.ptNodes = phraseBuilder.buildPT2ptPhrasesForASentence(tree, null);
+	}
+
+	/** Phrases computed in the constructor; not refreshed by setTree. */
+	public List<List<ParseTreeNode>> getPtNodes() {
+		return ptNodes;
+	}
+
+	public Tree getTree() {
+		return tree;
+	}
+
+	public void setTree(Tree tree) {
+		this.tree = tree;
+	}
+
+	public String getLabel() {
+		return label;
+	}
+
+	public void setLabel(String label) {
+		this.label = label;
+	}
+
+	/** The vertex is displayed by its label. */
+	@Override
+	public String toString(){
+		return label;
+	}
+}
+	

Added: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/parse_thicket2graph/ParseTreeVisualizer.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/parse_thicket2graph/ParseTreeVisualizer.java?rev=1555944&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/parse_thicket2graph/ParseTreeVisualizer.java (added)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/parse_thicket2graph/ParseTreeVisualizer.java Mon Jan  6 17:48:30 2014
@@ -0,0 +1,194 @@
+/* ==========================================
+ * JGraphT : a free Java graph-theory library
+ * ==========================================
+ *
+ * Project Info:  http://jgrapht.sourceforge.net/
+ * Project Creator:  Barak Naveh (http://sourceforge.net/users/barak_naveh)
+ *
+ * (C) Copyright 2003-2008, by Barak Naveh and Contributors.
+ *
+ * This library is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ * License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library; if not, write to the Free Software Foundation,
+ * Inc.,
+ * 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
+ */
+/* ----------------------
+ * JGraphAdapterDemo.java
+ * ----------------------
+ * (C) Copyright 2003-2008, by Barak Naveh and Contributors.
+ *
+ * Original Author:  Barak Naveh
+ * Contributor(s):   -
+ *
+ * $Id: JGraphAdapterDemo.java 725 2010-11-26 01:24:28Z perfecthash $
+ *
+ * Changes
+ * -------
+ * 03-Aug-2003 : Initial revision (BN);
+ * 07-Nov-2003 : Adaptation to JGraph 3.0 (BN);
+ *
+ */
+package opennlp.tools.parse_thicket.parse_thicket2graph;
+
+import java.awt.*;
+import java.awt.geom.*;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+
+import javax.swing.*;
+
+
+import org.jgraph.*;
+import org.jgraph.graph.*;
+
+import org.jgrapht.*;
+import org.jgrapht.ext.*;
+import org.jgrapht.graph.*;
+
+
+import org.jgrapht.graph.DefaultEdge;
+
+public class ParseTreeVisualizer
+extends JApplet
+{
+	//~ Static fields/initializers ---------------------------------------------
+
+	private static final long serialVersionUID = 3256346823498765434L;
+	private static final Color DEFAULT_BG_COLOR = Color.decode("#FAFBFF");
+	private static final Dimension DEFAULT_SIZE = new Dimension(1200, 800);
+
+	//~ Instance fields --------------------------------------------------------
+
+	//
+	private JGraphModelAdapter<String, DefaultEdge> jgAdapter;
+
+	/**
+	 * Opens a frame titled "Showing parse thicket" that displays the given
+	 * graph. A fresh visualizer instance is created for the display; closing
+	 * the frame exits the JVM (EXIT_ON_CLOSE).
+	 */
+	public void  showGraph(Graph g){
+		ParseTreeVisualizer visualizer = new ParseTreeVisualizer();
+		visualizer.importGraph(g);
+
+		JFrame window = new JFrame("Showing parse thicket");
+		window.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
+		window.getContentPane().add(visualizer);
+		window.pack();
+		window.setVisible(true);
+	}
+
+	/**
+	 * Wraps the graph in a JGraph component, adds it to this applet and
+	 * positions the vertices.
+	 * <p>
+	 * Vertices are handled as plain objects and labelled with
+	 * String.valueOf(vertex), so any vertex type whose toString() yields the
+	 * label works; iterating the vertex set as strings threw
+	 * ClassCastException for every other vertex type.
+	 * <p>
+	 * The text between the first and second '#' of a label is read as the level
+	 * number; labels without a numeric level count as level 0. Each vertex is
+	 * placed 20 pixels further down and right than the previous one, shifted
+	 * right by 50 pixels per vertex already seen on the same level (levels
+	 * above 0 only).
+	 */
+	private void importGraph(Graph g) {
+		// create a visualization using JGraph, via an adapter
+		jgAdapter = new JGraphModelAdapter<String, DefaultEdge>(g);
+
+		JGraph jgraph = new JGraph(jgAdapter);
+
+		adjustDisplaySettings(jgraph);
+		getContentPane().add(jgraph);
+		resize(DEFAULT_SIZE);
+
+		int count = 0;
+		Map<Integer, Integer> levelCount = new HashMap<Integer, Integer>();
+
+		for (Object vertex : g.vertexSet()) {
+			String label = String.valueOf(vertex);
+
+			int level = 0;
+			String[] parts = label.split("#");
+			if (parts.length > 1) {
+				try {
+					level = Integer.parseInt(parts[1]);
+				} catch (NumberFormatException e) {
+					// not a numeric level: keep the vertex on level 0
+					e.printStackTrace();
+				}
+			}
+
+			int howManyAlready = 0;
+			if (level > 0) {
+				Integer seen = levelCount.get(level);
+				if (seen != null) {
+					howManyAlready = seen;
+				}
+				levelCount.put(level, howManyAlready + 1);
+			}
+			positionVertexAt(vertex, count + howManyAlready * 50, count);
+			count += 20;
+		}
+	}
+
+	/**
+	 * An alternative starting point for this demo, to also allow running this
+	 * applet as an application: the applet is initialised and shown in a frame
+	 * that exits the JVM when closed.
+	 *
+	 * @param args ignored.
+	 */
+	public static void main(String [] args)
+	{
+		ParseTreeVisualizer visualizer = new ParseTreeVisualizer();
+		visualizer.init();
+
+		JFrame window = new JFrame("JGraphT Adapter to JGraph Demo");
+		window.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
+		window.getContentPane().add(visualizer);
+		window.pack();
+		window.setVisible(true);
+	}
+
+
+
+	/**
+	 * Gives the graph component the default size and a background colour: the
+	 * "bgcolor" applet parameter when it can be read, the default colour
+	 * otherwise.
+	 */
+	private void adjustDisplaySettings(JGraph jg)
+	{
+		jg.setPreferredSize(DEFAULT_SIZE);
+
+		String requestedColor;
+		try {
+			requestedColor = getParameter("bgcolor");
+		} catch (Exception ignored) {
+			// presumably thrown when not running inside an applet container
+			// (TODO confirm); the default colour is used in that case
+			requestedColor = null;
+		}
+
+		jg.setBackground(requestedColor == null
+				? DEFAULT_BG_COLOR
+				: Color.decode(requestedColor));
+	}
+
+	/**
+	 * Moves the cell of the given vertex to (x, y), keeping the width and
+	 * height it currently has.
+	 */
+	@SuppressWarnings("unchecked") // FIXME hb 28-nov-05: See FIXME below
+	private void positionVertexAt(Object vertex, int x, int y)
+	{
+		DefaultGraphCell vertexCell = jgAdapter.getVertexCell(vertex);
+		AttributeMap attributes = vertexCell.getAttributes();
+
+		Rectangle2D current = GraphConstants.getBounds(attributes);
+		GraphConstants.setBounds(
+				attributes,
+				new Rectangle2D.Double(x, y, current.getWidth(), current.getHeight()));
+
+		// TODO: Clean up generics once JGraph goes generic
+		AttributeMap pendingEdit = new AttributeMap();
+		pendingEdit.put(vertexCell, attributes);
+		jgAdapter.edit(pendingEdit, null, null, null);
+	}
+
+}
+
+// End JGraphAdapterDemo.java

Added: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/pattern_structure/PhraseConcept.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/pattern_structure/PhraseConcept.java?rev=1555944&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/pattern_structure/PhraseConcept.java (added)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/pattern_structure/PhraseConcept.java Mon Jan  6 17:48:30 2014
@@ -0,0 +1,45 @@
+package opennlp.tools.parse_thicket.pattern_structure;
+
+import java.util.*;
+import java.io.*;
+
+import opennlp.tools.textsimilarity.ParseTreeChunk;
+
+/**
+ * A concept holding a position index, an intent (groups of parse tree chunks)
+ * and the indices of its parent concepts.
+ */
+public class PhraseConcept {
+	int position;
+	//Set<Integer> intent;
+	List<List<ParseTreeChunk>> intent;
+	Set<Integer> parents;
+
+	/** Creates a concept at position -1 with an empty intent and no parents. */
+	public PhraseConcept() {
+		position = -1;
+		intent = new ArrayList<List<ParseTreeChunk>>();
+		parents = new HashSet<Integer>();
+	}
+
+	public void setPosition( int newPosition ){
+		position = newPosition;
+	}
+
+	/**
+	 * Replaces the content of the intent with the given groups; the list
+	 * object held by this concept stays the same.
+	 */
+	public void setIntent( List<List<ParseTreeChunk>> newIntent ){
+		// snapshot first: if the caller passes this concept's own list,
+		// clearing it would otherwise wipe the data before it is copied
+		List<List<ParseTreeChunk>> snapshot = new ArrayList<List<ParseTreeChunk>>(newIntent);
+		intent.clear();
+		intent.addAll(snapshot);
+	}
+
+	/**
+	 * Replaces the content of the parent set with the given indices; the set
+	 * object held by this concept stays the same.
+	 */
+	public void setParents( Set<Integer> newParents ){
+		// snapshot first, for the same aliasing reason as in setIntent
+		Set<Integer> snapshot = new HashSet<Integer>(newParents);
+		parents.clear();
+		parents.addAll(snapshot);
+	}
+
+	/** Prints position, intent and parents to standard output. */
+	public void printConcept() {
+		System.out.println("Concept position:" + position);
+		System.out.println("Concept intent:" + intent);
+		System.out.println("Concept parents:" + parents);
+	}
+
+	public static void main(String []args) {
+		PhraseConcept c = new PhraseConcept();
+		c.printConcept();
+		c.setPosition(10);
+		c.printConcept();
+		//List<List<ParseTreeChunk>> test = new List<List<ParseTreeChunk>>();
+		//c.setIntent(test);
+		c.printConcept();
+	}
+}
\ No newline at end of file

Added: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/pattern_structure/PhrasePatternStructure.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/pattern_structure/PhrasePatternStructure.java?rev=1555944&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/pattern_structure/PhrasePatternStructure.java (added)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/pattern_structure/PhrasePatternStructure.java Mon Jan  6 17:48:30 2014
@@ -0,0 +1,166 @@
+package opennlp.tools.parse_thicket.pattern_structure;
+
+import java.util.*;
+import java.io.*;
+
+import opennlp.tools.parse_thicket.ParseCorefsBuilder;
+import opennlp.tools.parse_thicket.ParseThicket;
+import opennlp.tools.parse_thicket.ParseTreeNode;
+import opennlp.tools.parse_thicket.matching.PT2ThicketPhraseBuilder;
+import opennlp.tools.textsimilarity.ParseTreeChunk;
+import opennlp.tools.textsimilarity.ParseTreeMatcherDeterministic;
+
+
+public class PhrasePatternStructure {
+	int objectCount;
+	int attributeCount;
+	ArrayList<PhraseConcept> conceptList;
+	ParseTreeMatcherDeterministic md; 
+	/**
+	 * Creates a lattice that initially contains a single bottom concept at position 0.
+	 *
+	 * @param objectCounts    value stored as the object count (reported by printLatticeStats)
+	 * @param attributeCounts value stored as the attribute count (reported by printLatticeStats)
+	 */
+	public PhrasePatternStructure(int objectCounts, int attributeCounts) {
+		this.objectCount = objectCounts;
+		this.attributeCount = attributeCounts;
+		this.md = new ParseTreeMatcherDeterministic();
+		this.conceptList = new ArrayList<PhraseConcept>();
+
+		// The bottom concept keeps the intent it was constructed with and occupies index 0.
+		PhraseConcept bottomConcept = new PhraseConcept();
+		bottomConcept.setPosition(0);
+		this.conceptList.add(bottomConcept);
+	}
+	/**
+	 * Starting from the concept at position Generator, repeatedly moves to a parent
+	 * whose intent contains all elements of the given intent, until no such parent exists.
+	 *
+	 * @param intent    the intent being looked up
+	 * @param Generator position of the concept where the climb starts
+	 * @return position of the concept where the climb stopped
+	 */
+	public int GetMaximalConcept(List<List<ParseTreeChunk>> intent, int Generator) {
+		int current = Generator;
+		boolean movedUp;
+		do {
+			movedUp = false;
+			for (int parentPosition : conceptList.get(current).parents) {
+				boolean parentCoversIntent = conceptList.get(parentPosition).intent.containsAll(intent);
+				if (parentCoversIntent) {
+					// Restart the scan from the parent that still covers the intent.
+					current = parentPosition;
+					movedUp = true;
+					break;
+				}
+			}
+		} while (movedUp);
+		return current;
+	}
+	/**
+	 * Adds a concept with the given intent to the lattice, unless the concept found by
+	 * GetMaximalConcept already has an equal intent.
+	 * <p>
+	 * The method is recursive: for parents of the generator whose intent is not fully
+	 * contained in the given intent, the result of matching the two intents is added first.
+	 * Progress is traced to standard output.
+	 *
+	 * @param intent    intent of the concept to add
+	 * @param generator position of the concept from which the search starts
+	 * @return position of the existing concept with an equal intent, or of the newly created concept
+	 */
+	public int AddIntent(List<List<ParseTreeChunk>> intent, int generator) {
+		System.out.println("debug");
+		System.out.println("called for " + intent);
+		//printLattice();
+		// Climb from the starting concept to the concept where GetMaximalConcept stops.
+		int generator_tmp = GetMaximalConcept(intent, generator);
+		generator = generator_tmp;
+		// Nothing to add when that concept already carries an equal intent.
+		if (conceptList.get(generator).intent.equals(intent)) {
+			System.out.println("at generator:" + conceptList.get(generator).intent);
+			System.out.println("to add:" + intent);
+
+			System.out.println("already generated");
+			return generator;
+		}
+		Set<Integer> generatorParents = conceptList.get(generator).parents;
+		// Positions that will become the parents of the new concept.
+		Set<Integer> newParents = new HashSet<Integer>();
+		for (int candidate : generatorParents) {
+			if (!intent.containsAll(conceptList.get(candidate).intent)) {
+			//if (!conceptList.get(candidate).intent.containsAll(intent)) {
+				//Set<Integer> intersection = new HashSet<Integer>(conceptList.get(candidate).intent);
+				//List<List<ParseTreeChunk>> intersection = new ArrayList<List<ParseTreeChunk>>(conceptList.get(candidate).intent);
+				//intersection.retainAll(intent);
+				// The matcher result of the two intents takes the role the set intersection
+				// had in the commented-out variants above; it is added recursively and the
+				// candidate is replaced by the position returned for it.
+				List<List<ParseTreeChunk>> intersection = md
+				.matchTwoSentencesGroupedChunksDeterministic(intent, conceptList.get(candidate).intent);
+				System.out.println("recursive call (inclusion)");
+				candidate = AddIntent(intersection, candidate);
+			}
+			boolean addParents = true;
+			System.out.println("now iterating over parents");
+			// Compare the candidate with the parents collected so far: skip it when a collected
+			// parent's intent contains all of the candidate's intent; otherwise drop collected
+			// parents whose intent is entirely contained in the candidate's intent.
+			Iterator<Integer> iterator = newParents.iterator();
+			while (iterator.hasNext()) {
+				Integer parent = iterator.next();
+				if (conceptList.get(parent).intent.containsAll(conceptList.get(candidate).intent)) {
+					addParents = false;
+					break;
+				}
+				else {
+					if (conceptList.get(candidate).intent.containsAll(conceptList.get(parent).intent)) {
+						// Removal goes through the iterator because the set is being iterated.
+						iterator.remove();
+					}
+				}
+			}
+			/*for (int parent : newParents) {
+				System.out.println("parent = " + parent);
+				System.out.println("candidate intent:"+conceptList.get(candidate).intent);
+				System.out.println("parent intent:"+conceptList.get(parent).intent);
+				
+				if (conceptList.get(parent).intent.containsAll(conceptList.get(candidate).intent)) {
+					addParents = false;
+					break;
+				}
+				else {
+					if (conceptList.get(candidate).intent.containsAll(conceptList.get(parent).intent)) {
+						newParents.remove(parent);
+					}
+				}
+			}*/
+			if (addParents) {
+				newParents.add(candidate);
+			}
+		}
+		System.out.println("size of lattice: " + conceptList.size());
+		// Create the new concept at the end of the list; its position equals its list index.
+		PhraseConcept newConcept = new PhraseConcept();
+		newConcept.setIntent(intent);
+		newConcept.setPosition(conceptList.size());
+		conceptList.add(newConcept);
+		// The new concept becomes a parent of the generator ...
+		conceptList.get(generator).parents.add(newConcept.position);
+		// ... and the collected parents are moved from the generator to the new concept.
+		for (int newParent: newParents) {
+			if (conceptList.get(generator).parents.contains(newParent)) {
+				conceptList.get(generator).parents.remove(newParent);
+			}
+			conceptList.get(newConcept.position).parents.add(newParent);
+		}
+		return newConcept.position;
+	}
+	public void printLatticeStats() {
+		System.out.println("Lattice stats");
+		System.out.println("max_object_index = " + objectCount);
+		System.out.println("max_attribute_index = " + attributeCount);
+		System.out.println("Current concept count = " + conceptList.size());
+	}
+	public void printLattice() {
+		for (int i = 0; i < conceptList.size(); ++i) {
+			printConceptByPosition(i);
+		}
+	}
+	public void printConceptByPosition(int index) {
+		System.out.println("Concept at position " + index);
+		conceptList.get(index).printConcept();
+	}
+	/**
+	 * Converts phrases given as node lists into chunks and groups them by the phrase
+	 * type of their first node.
+	 *
+	 * @param phrs phrases, each a list of parse tree nodes; null or empty phrases are skipped
+	 * @return a list of exactly three lists, in this order: NP chunks, VP chunks, PP chunks;
+	 *         phrases of any other (or unknown) type are left out
+	 */
+	public List<List<ParseTreeChunk>> formGroupedPhrasesFromChunksForPara(
+			List<List<ParseTreeNode>> phrs) {
+		List<List<ParseTreeChunk>> results = new ArrayList<List<ParseTreeChunk>>();
+		List<ParseTreeChunk> nps = new ArrayList<ParseTreeChunk>(), vps = new ArrayList<ParseTreeChunk>(), 
+				pps = new ArrayList<ParseTreeChunk>();
+		for(List<ParseTreeNode> ps:phrs){
+			// An empty phrase has no first node to take the phrase type from.
+			if (ps == null || ps.isEmpty()){
+				continue;
+			}
+			ParseTreeChunk ch = convertNodeListIntoChunk(ps);
+			String ptype = ps.get(0).getPhraseType();
+			// Constant-first comparison so that a missing phrase type is simply not grouped.
+			if ("NP".equals(ptype)){
+				nps.add(ch);
+			} else if ("VP".equals(ptype)){
+				vps.add(ch);
+			} else if ("PP".equals(ptype)){
+				pps.add(ch);
+			}
+		}
+		results.add(nps); results.add(vps); results.add(pps);
+		return results;
+	}
+	/**
+	 * Builds a chunk from the words and POS tags of the given nodes and sets the chunk's
+	 * main POS to the phrase type of the first node.
+	 *
+	 * @param ps nodes of one phrase; the first node is read unconditionally
+	 * @return the chunk built from the nodes
+	 */
+	private ParseTreeChunk convertNodeListIntoChunk(List<ParseTreeNode> ps) {
+		List<String> words = new ArrayList<String>();
+		List<String> tags = new ArrayList<String>();
+		Iterator<ParseTreeNode> nodes = ps.iterator();
+		while (nodes.hasNext()) {
+			ParseTreeNode node = nodes.next();
+			words.add(node.getWord());
+			tags.add(node.getPos());
+		}
+		ParseTreeChunk chunk = new ParseTreeChunk(words, tags, 0, 0);
+		String phraseType = ps.get(0).getPhraseType();
+		chunk.setMainPOS(phraseType);
+		return chunk;
+	}
+	
+}
\ No newline at end of file

Added: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/rhetoric_structure/RhetoricStructureArcsBuilder.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/rhetoric_structure/RhetoricStructureArcsBuilder.java?rev=1555944&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/rhetoric_structure/RhetoricStructureArcsBuilder.java (added)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/rhetoric_structure/RhetoricStructureArcsBuilder.java Mon Jan  6 17:48:30 2014
@@ -0,0 +1,162 @@
+package opennlp.tools.parse_thicket.rhetoric_structure;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import opennlp.tools.parse_thicket.ArcType;
+import opennlp.tools.parse_thicket.Pair;
+import opennlp.tools.parse_thicket.ParseThicket;
+import opennlp.tools.parse_thicket.ParseTreeNode;
+import opennlp.tools.parse_thicket.WordWordInterSentenceRelationArc;
+
+import org.jgrapht.Graph;
+import org.jgrapht.graph.DefaultEdge;
+import org.jgrapht.graph.SimpleGraph;
+
+
+import edu.stanford.nlp.trees.Tree;
+
+public class RhetoricStructureArcsBuilder {
+	private RhetoricStructureMarker markerBuilderForSentence = new RhetoricStructureMarker();
+
+	/**
+	 * Extracts the RST markers of every sentence of the parse thicket.
+	 *
+	 * @param pt parse thicket whose node lists (one per sentence) are scanned
+	 * @return markers keyed by the zero-based order of the sentence in the thicket
+	 */
+	private Map<Integer, List<Pair<String, Integer[]>>> buildMarkers(ParseThicket pt){
+		Map<Integer, List<Pair<String, Integer[]>>> markersBySentence =
+				new HashMap<Integer, List<Pair<String, Integer[]>>>();
+		int sentenceNumber = 0;
+		for (List<ParseTreeNode> sentenceNodes : pt.getNodesThicket()) {
+			markersBySentence.put(sentenceNumber++,
+					markerBuilderForSentence.extractRSTrelationInSentenceGetBoundarySpan(sentenceNodes));
+		}
+		return markersBySentence;
+	}
+
+
+	/*
+	 * Induced RST algorithm
+	 * 
+	 * Input: the obtained RST markers (positions of the words that
+	 * split a sentence into potential RST relation arguments) +
+	 * the current Parse Thicket with arcs for coreferences
+	 * 
+	 * We search for parts of sentences on the opposite side of RST markers
+	 * 
+	 * $sentPosFrom$  marker
+	 *  | == == == [ ] == == == |
+	 *     \				\
+	 *       \				  \
+	 *       coref          RST arc being formed
+	 *           \ 				\
+	 *             \			 \
+	 *     | == == == == == [  ] == == ==|      
+	 *     
+	 *       Mark yelled at his dog, but it disobeyed
+	 *        |							\
+	 *       coref                 RST arc for CONTRAST being formed
+	 *        | 							\
+	 *       He was upset, however he did not show it
+	 *       $sentPosTo$
+	 */
+	/**
+	 * Builds RST arcs for pairs of sentences that are linked by a coreference arc and
+	 * that both contain an RST marker.
+	 * <p>
+	 * For every coreference arc between an earlier and a later sentence, at most one RST
+	 * arc is created: the first combination of phrases and markers for which, in both
+	 * sentences, the coreference position, the phrase start and the marker start pass
+	 * isSequence.
+	 *
+	 * @param arcs              existing inter-sentence arcs; only arcs whose type starts with
+	 *                          "coref" are considered, and the list is not modified
+	 * @param sentNumPhrasesMap phrases of each sentence, keyed by sentence number
+	 * @param pt                parse thicket providing the sentences and their nodes
+	 * @return the newly built RST arcs only (possibly empty, never null)
+	 */
+	public List<WordWordInterSentenceRelationArc> buildRSTArcsFromMarkersAndCorefs(
+			List<WordWordInterSentenceRelationArc> arcs,
+			Map<Integer, List<List<ParseTreeNode>>> sentNumPhrasesMap, 
+			ParseThicket pt ) {
+		List<WordWordInterSentenceRelationArc> arcsRST = new ArrayList<WordWordInterSentenceRelationArc>();
+
+		Map<Integer, List<Pair<String, Integer[]>>> rstMarkersMap = buildMarkers(pt);
+
+		int sentenceCount = pt.getSentences().size();
+		for(int nSentFrom=0; nSentFrom<sentenceCount; nSentFrom++){
+			List<List<ParseTreeNode>> phrasesFrom = sentNumPhrasesMap.get(nSentFrom);
+			List<Pair<String, Integer[]>> markersFrom = rstMarkersMap.get(nSentFrom);
+			// A sentence without phrases or markers cannot be the source of an RST arc.
+			if (phrasesFrom == null || markersFrom == null)
+				continue;
+			for(int nSentTo=nSentFrom+1; nSentTo<sentenceCount; nSentTo++){
+				List<List<ParseTreeNode>> phrasesTo = sentNumPhrasesMap.get(nSentTo);
+				List<Pair<String, Integer[]>> markersTo = rstMarkersMap.get(nSentTo);
+				if (phrasesTo == null || markersTo == null)
+					continue;
+				for(WordWordInterSentenceRelationArc arc: arcs){
+					// arc should be coref and link these sentences
+					if (nSentFrom != arc.getCodeFrom().getFirst() ||
+							nSentTo != arc.getCodeTo().getFirst() ||
+							!arc.getArcType().getType().startsWith("coref")
+							)
+						continue;
+					WordWordInterSentenceRelationArc arcRST = findRSTArcForCoref(
+							nSentFrom, nSentTo,
+							arc.getCodeFrom().getSecond(), arc.getCodeTo().getSecond(),
+							phrasesFrom, phrasesTo, markersFrom, markersTo);
+					if (arcRST != null)
+						arcsRST.add(arcRST);
+				}
+			}
+		}
+
+		// Return the arcs built here; returning the input list would discard them.
+		return arcsRST;
+	}
+
+	/**
+	 * Finds the first phrase/marker combination that yields an RST arc for one coreference link.
+	 *
+	 * @return the RST arc, or null when no combination qualifies
+	 */
+	private WordWordInterSentenceRelationArc findRSTArcForCoref(
+			int nSentFrom, int nSentTo, int sentPosFrom, int sentPosTo,
+			List<List<ParseTreeNode>> phrasesFrom, List<List<ParseTreeNode>> phrasesTo,
+			List<Pair<String, Integer[]>> markersFrom, List<Pair<String, Integer[]>> markersTo) {
+		for(List<ParseTreeNode> vpFrom: phrasesFrom){
+			if (vpFrom == null || vpFrom.isEmpty())
+				continue;
+			for(List<ParseTreeNode> vpTo: phrasesTo){
+				if (vpTo == null || vpTo.isEmpty())
+					continue;
+				for(Pair<String, Integer[]> mFrom: markersFrom){
+					for(Pair<String, Integer[]> mTo: markersTo) {
+						// the phrases should be on an opposite side of rst marker for a coref link
+						if (isSequence( new Integer[] { sentPosFrom,  vpFrom.get(0).getId(), mFrom.getSecond()[0]}) &&
+								isSequence( new Integer[] { sentPosTo,  vpTo.get(0).getId(), mTo.getSecond()[0]})	){
+							// The relation name comes from the marker of the earlier sentence;
+							// the arc connects the end positions of both marker spans.
+							ArcType arcType = new ArcType("rst", mFrom.getFirst(), 0, 0);
+							return new WordWordInterSentenceRelationArc(
+									new Pair<Integer, Integer>(nSentFrom, mFrom.getSecond()[1]), 
+									new Pair<Integer, Integer>(nSentTo, mTo.getSecond()[1]), "", "", arcType);
+						}
+					}
+				}
+			}
+		}
+		return null;
+	}
+
+// check if the word positions occur in sentence in the order Integer[]
+// TODO make more sensitive algo	
+	/**
+	 * Tells whether the given positions are monotonic, i.e. never decreasing or never increasing.
+	 *
+	 * @param integers positions to check
+	 * @return false for a null array, fewer than three elements, or any element that is
+	 *         null or 0; otherwise true exactly when the values are in non-decreasing or
+	 *         non-increasing order
+	 */
+	private static boolean isSequence(Integer[] integers) {
+		//TODO better construction of array
+		if (integers == null || integers.length < 3) {
+			return false;
+		}
+		for (Integer position : integers) {
+			if (position == null || position.intValue() == 0) {
+				return false;
+			}
+		}
+
+		boolean nonDecreasing = true;
+		boolean nonIncreasing = true;
+		for (int k = 1; k < integers.length; k++) {
+			int previous = integers[k - 1];
+			int current = integers[k];
+			if (previous > current) {
+				nonDecreasing = false;
+			}
+			if (previous < current) {
+				nonIncreasing = false;
+			}
+		}
+		return nonDecreasing || nonIncreasing;
+	}
+
+
+
+	// Empty entry point: running this class directly does nothing.
+	public static void main(String[] args){
+
+
+	}
+}

Added: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/rhetoric_structure/RhetoricStructureMarker.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/rhetoric_structure/RhetoricStructureMarker.java?rev=1555944&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/rhetoric_structure/RhetoricStructureMarker.java (added)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/rhetoric_structure/RhetoricStructureMarker.java Mon Jan  6 17:48:30 2014
@@ -0,0 +1,129 @@
+package opennlp.tools.parse_thicket.rhetoric_structure;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import opennlp.tools.parse_thicket.IGeneralizer;
+import opennlp.tools.parse_thicket.Pair;
+import opennlp.tools.parse_thicket.ParseTreeNode;
+
+
+public class RhetoricStructureMarker implements IGeneralizer<Integer[]>  {
+	//private static String rstRelations[] = {"antithesis", "concession", "contrast", "elaboration"};
+	List<Pair<String, ParseTreeNode[]>> rstMarkers = new ArrayList<Pair<String, ParseTreeNode[]>>();
+
+	/**
+	 * Registers the built-in marker templates. Each template is a relation name plus a
+	 * sequence of nodes given as (word, POS) pairs, as in main below; "*" is used as a
+	 * wildcard. The registration order is the order in which
+	 * extractRSTrelationInSentenceGetBoundarySpan reports matches.
+	 */
+	public  RhetoricStructureMarker(){
+
+		addMarker("contrast", new ParseTreeNode(",",","),  new ParseTreeNode("than",","));
+		addMarker("antithesis", new ParseTreeNode("although",","),  new ParseTreeNode("*","*"));
+		addMarker("contrast", new ParseTreeNode(",",","),  new ParseTreeNode("however","*"));
+		addMarker("contrast", new ParseTreeNode("however","*"), new ParseTreeNode(",",","),
+				new ParseTreeNode("*","prp"));
+		addMarker("elaboration", new ParseTreeNode(",",","),  new ParseTreeNode("*","NN"));
+		addMarker("elaboration", new ParseTreeNode("as","*"),  new ParseTreeNode("a","*"));
+
+		addMarker("explanation", new ParseTreeNode(",",","),  new ParseTreeNode("because",","));
+		addMarker("example", new ParseTreeNode("for","IN"),  new ParseTreeNode("example","NN"));
+		// ", yet" mirrors the ", however" template above (the word was misspelled as "ye").
+		addMarker("contrast", new ParseTreeNode(",",","),  new ParseTreeNode("yet","*"));
+		addMarker("contrast", new ParseTreeNode("yet","*"), new ParseTreeNode(",",","),
+				new ParseTreeNode("*","prp"));
+
+		addMarker("contrast", new ParseTreeNode("yet","*"), new ParseTreeNode("i","*"));
+
+		addMarker("explanation", new ParseTreeNode(",",","),  new ParseTreeNode("where","*"));
+		//as long as (the leading "as" is deliberately not part of the template)
+		addMarker("temp_sequence", /*new ParseTreeNode("as","*"),*/ new ParseTreeNode("*","RB"), 
+				new ParseTreeNode("as","IN"));
+		addMarker("temp_sequence", /*new ParseTreeNode("as","*"),*/ new ParseTreeNode("*","VB*"), 
+				new ParseTreeNode("until","IN"));
+
+	}
+
+	/** Appends one template (relation name plus node sequence) to rstMarkers. */
+	private void addMarker(String relation, ParseTreeNode... template){
+		rstMarkers.add(new Pair<String, ParseTreeNode[]>(relation, template));
+	}
+
+	/* For a sentence, we obtain a list of markers with the CA word and position in the sentence
+	 * Output span is an integer array with start/end occurrence of an RST marker in a sentence
+	 * */
+	public List<Pair<String, Integer[]>> extractRSTrelationInSentenceGetBoundarySpan(List<ParseTreeNode> sentence){
+		// One entry per template that matches: the relation name and the first span found.
+		List<Pair<String, Integer[]>> found = new ArrayList<Pair<String, Integer[]>> ();
+
+		for (int t = 0; t < rstMarkers.size(); t++) {
+			Pair<String, ParseTreeNode[]> rule = rstMarkers.get(t);
+			List<Integer[]> spans = generalize(sentence, rule.getSecond());
+			if (spans.isEmpty()) {
+				continue; // this template does not occur in the sentence
+			}
+			Integer[] firstSpan = spans.get(0);
+			found.add(new Pair<String, Integer[]>(rule.getFirst(), firstSpan));
+		}
+		return found;
+	}
+
+	/* Rule application in the form of generalization
+	 * Generalizing a sentence with a rule (a template), we obtain the occurrence of rhetoric marker
+	 *
+	 * o1 - sentence
+	 * o2 - rule/template, specifying lemmas and/or POS, including punctuation
+	 * @see opennlp.tools.parse_thicket.IGeneralizer#generalize(java.lang.Object, java.lang.Object)
+	 * returns the span Integer[] 
+	 */
+	@Override
+	public List<Integer[]> generalize(Object o1, Object o2) {
+		List<Integer[]> spans = new ArrayList<Integer[]>();
+
+		@SuppressWarnings("unchecked") // the caller passes the sentence as a list of nodes
+		List<ParseTreeNode> sentence = (List<ParseTreeNode> )o1;
+		ParseTreeNode[] template = (ParseTreeNode[]) o2;
+
+		// Try every sentence position as a possible start of the template.
+		for (int start = 0; start < sentence.size(); start++) {
+			int matched = 0; // number of template nodes matched from this start
+			while (start + matched < sentence.size() && matched < template.length) {
+				ParseTreeNode templateNode = template[matched];
+				ParseTreeNode sentenceNode = sentence.get(start + matched);
+				List<ParseTreeNode> generalized = templateNode.generalize(templateNode, sentenceNode);
+				if (generalized.isEmpty() || generalized.get(0) == null) {
+					break;
+				}
+				// A result that is a wildcard in both word and POS counts as no match.
+				boolean onlyWildcards = generalized.get(0).getWord().equals("*")
+						&& generalized.get(0).getPos().equals("*");
+				if (onlyWildcards) {
+					break;
+				}
+				matched++;
+			}
+			// Success only when the whole template was consumed; the first match wins.
+			if (matched == template.length) {
+				spans.add(new Integer[]{start, start + matched - 1});
+				return spans;
+			}
+		}
+		return spans; 
+	}
+	
+	public String markerToString(List<Pair<String, Integer[]>> res){
+		StringBuffer buf = new StringBuffer();
+		buf.append("[");
+		for(Pair<String, Integer[]> marker: res){
+			buf.append(marker.getFirst()+":");
+			for(int a: marker.getSecond()){
+				buf.append(a+" ");
+			}
+			buf.append (" | ");
+		}
+		buf.append("]");
+		return buf.toString();
+	}
+
+	/** Manual demo: prints the markers found in a hard-coded sample sentence. */
+	public static void main(String[] args){
+		List<ParseTreeNode> sampleSentence = Arrays.asList(
+				new ParseTreeNode("he","prn"), new ParseTreeNode("was","vbz"), new ParseTreeNode("more","jj"),
+				new ParseTreeNode(",",","),  new ParseTreeNode("than",","), new ParseTreeNode("little","jj"),
+				new ParseTreeNode("boy","nn"),
+				new ParseTreeNode(",",","), new ParseTreeNode("however","*"), new ParseTreeNode(",",","),
+				new ParseTreeNode("he","prp"), new ParseTreeNode("was","vbz"), new ParseTreeNode("adult","jj"));
+
+		RhetoricStructureMarker marker = new RhetoricStructureMarker();
+		List<Pair<String, Integer[]>> found = marker.extractRSTrelationInSentenceGetBoundarySpan(sampleSentence);
+		String rendered = marker.markerToString(found);
+		System.out.println(rendered);
+	} 
+}

Modified: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingQueryRunner.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingQueryRunner.java?rev=1555944&r1=1555943&r2=1555944&view=diff
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingQueryRunner.java (original)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingQueryRunner.java Mon Jan  6 17:48:30 2014
@@ -17,28 +17,90 @@
 
 package opennlp.tools.similarity.apps;
 
-import java.io.BufferedReader;
-import java.io.InputStreamReader;
-import java.net.URL;
-import java.net.URLConnection;
-import java.net.URLEncoder;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.logging.Logger;
 
-import org.apache.commons.lang.StringUtils;
-import org.json.JSONArray;
-import org.json.JSONObject;
+import net.billylieurance.azuresearch.AzureSearchImageQuery;
+import net.billylieurance.azuresearch.AzureSearchImageResult;
+import net.billylieurance.azuresearch.AzureSearchResultSet;
+import net.billylieurance.azuresearch.AzureSearchWebQuery;
+import net.billylieurance.azuresearch.AzureSearchWebResult;
 
 public class BingQueryRunner {
-  protected static final String APP_ID = "e8ADxIjn9YyHx36EihdjH/tMqJJItUrrbPTUpKahiU0=";
-    //"DD4E2A5DF8B7E5801ED443E47DC600D5F3E62713";
-  // TODO user needs to have own APP_ID from Bing API
+	
+	protected static String BING_KEY = "e8ADxIjn9YyHx36EihdjH/tMqJJItUrrbPTUpKahiU0=";
+	private static final Logger LOG = Logger
+		      .getLogger("opennlp.tools.similarity.apps.BingQueryRunner");
+	protected AzureSearchWebQuery aq = new AzureSearchWebQuery();
+	private AzureSearchImageQuery iq = new AzureSearchImageQuery();
+	
+	/**
+	 * Replaces the Bing key used for subsequent queries. The key is held in a static
+	 * field, so the change is visible to every BingQueryRunner instance.
+	 *
+	 * @param key the key to use
+	 */
+	public void setKey(String key){
+		BingQueryRunner.BING_KEY = key;
+	}
+	
+	/**
+	 * Sets the market of the web query object.
+	 *
+	 * @param language market value passed to the query, e.g. "es-MX" as used in main
+	 */
+	public void setLang(String language){
+		this.aq.setMarket(language);
+	}
+  
+	/**
+	 * Runs a web search and converts the results into hits.
+	 * <p>
+	 * If the query fails with the configured key, it is retried once with a fallback key.
+	 * Failures are logged, not thrown.
+	 *
+	 * @param query the search query
+	 * @param nRes  value passed to the query as results per page
+	 * @return the hits (title, abstract, URL); empty when no result set is available, never null
+	 */
+	public List<HitBase> runSearch(String query, int nRes) {
+		aq.setAppid(BING_KEY);
+		aq.setQuery(query);		
+		aq.setPerPage(nRes);
+		try {
+			aq.doQuery();
+		} catch (Exception e) { // most likely exception is due to limit on bing key
+			LOG.log(java.util.logging.Level.WARNING,
+					"Bing query failed with the configured key, retrying with the fallback key", e);
+			// NOTE(review): hard-coded fallback credential; should come from configuration.
+			aq.setAppid("pjtCgujmf9TtfjCVBdcQ2rBUQwGLmtLtgCG4Ex7kekw");
+			try {
+				aq.doQuery();
+			} catch (Exception e1) {
+				LOG.log(java.util.logging.Level.SEVERE,
+						"Bing query failed with the fallback key as well", e1);
+			}
+		}
+		
+		//org.xml.sax.SAXParseException
+		
+		List<HitBase> results = new ArrayList<HitBase> ();
+		AzureSearchResultSet<AzureSearchWebResult> ars = aq.getQueryResult();
+		// No result set (e.g. both attempts failed): report no hits instead of failing below.
+		if (ars == null) {
+			return results;
+		}
+		
+		for (AzureSearchWebResult anr : ars){
+		    HitBase h = new HitBase();
+		    h.setAbstractText(anr.getDescription());
+		    h.setTitle(anr.getTitle());
+		    h.setUrl(anr.getUrl());
+		    results.add(h);
+		}
+		return results;
+	}
+	
+	
+	/**
+	 * Runs an image search with the current key.
+	 *
+	 * @param query the search query
+	 * @return the result set reported by the image query object
+	 */
+	public AzureSearchResultSet<AzureSearchImageResult> runImageSearch(String query) {
+		this.iq.setAppid(BING_KEY);
+		this.iq.setQuery(query);
+		this.iq.doQuery();
+
+		return this.iq.getQueryResult();
+	}
+	/**
+	 * Counts the hits returned for a "site:" query on the given site.
+	 *
+	 * @param site the site to query
+	 * @return number of hits returned by runSearch for that query
+	 */
+	public int getTotalPagesAtASite(String site)
+	{
+		String siteQuery = "site:"+site;
+		List<HitBase> hits = runSearch(siteQuery, 1000000);
+		return hits.size();
+	}
+	
+
+	/**
+	 * Runs a web search with 100 as the results-per-page value.
+	 *
+	 * @param query the search query
+	 * @return the hits returned by the two-argument runSearch
+	 */
+	public List<HitBase> runSearch(String query) {
+		final int resultsPerPage = 100;
+		return runSearch(query, resultsPerPage);
+	}	
+	
+	
+	
 
   private float snapshotSimilarityThreshold = 0.4f;
 
-  private static final Logger LOG = Logger
-      .getLogger("opennlp.tools.similarity.apps.BingQueryRunner");
+  
 
   public void setSnapshotSimilarityThreshold(float thr) {
     snapshotSimilarityThreshold = thr;
@@ -53,8 +115,7 @@ public class BingQueryRunner {
   }
 
   /*
-   * 
-   */
+ 
 
   private String constructBingUrl(String query, String domainWeb, String lang,
       int numbOfHits) throws Exception {
@@ -73,9 +134,8 @@ public class BingQueryRunner {
     return yahooRequest;
   }
 
-  /*
-     *  
-     */
+ 
+    
   public ArrayList<String> search(String query, String domainWeb, String lang,
       int numbOfHits) throws Exception {
     URL url = new URL(constructBingUrl(query, domainWeb, lang, numbOfHits));
@@ -145,6 +205,7 @@ public class BingQueryRunner {
     hits = HitBase.removeDuplicates(hits);
     return hits;
   }
+  */
 
   // TODO comment back when dependencies resolved (CopyrightViolations)
   /*
@@ -185,10 +246,16 @@ public class BingQueryRunner {
 
   public static void main(String[] args) {
     BingQueryRunner self = new BingQueryRunner();
+    
+    AzureSearchResultSet<AzureSearchImageResult> res = self.runImageSearch("albert einstein");
+    System.out.println(res);
     try {
+    	self.setLang("es-MX");
+    	self.setKey(
+    			"e8ADxIjn9YyHx36EihdjH/tMqJJItUrrbPTUpKahiU0=");
       List<HitBase> resp = self
-          .runSearch("Rates rise at weekly Treasury auction");
-      // "British Actress Lynn Redgrave dies at 67");
+          .runSearch(//"art scene");
+        		  "biomecanica las palancas");
       System.out.print(resp.get(0));
     } catch (Exception e) {
       // TODO Auto-generated catch block
@@ -196,6 +263,12 @@ public class BingQueryRunner {
     }
 
     /*
+     * 
+     * de-DE
+     * es-MX
+     * es-SP
+     */
+    /*
      * String[] submittedNews = new String[]{
      * "Asian airports had already increased security following the Christmas Day attack, but South Korea and Pakistan are thinking about additional measures."
      * ,

Added: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/ContentGenerator.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/ContentGenerator.java?rev=1555944&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/ContentGenerator.java (added)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/ContentGenerator.java Mon Jan  6 17:48:30 2014
@@ -0,0 +1,467 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.similarity.apps;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.logging.Logger;
+
+import opennlp.tools.parse_thicket.Triple;
+import opennlp.tools.similarity.apps.utils.PageFetcher;
+import opennlp.tools.similarity.apps.utils.StringDistanceMeasurer;
+import opennlp.tools.similarity.apps.utils.Utils;
+import opennlp.tools.textsimilarity.ParseTreeChunk;
+import opennlp.tools.textsimilarity.ParseTreeChunkListScorer;
+import opennlp.tools.textsimilarity.SentencePairMatchResult;
+import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;
+
+/*
+ * This class generates content by using web mining and syntactic generalization: it retrieves sentences from the web,
+ * then converts and combines them into a form
+ * expected to be readable by humans and indistinguishable from genuine content by search engines.
+ * 
+ */
+
+public class ContentGenerator /*extends RelatedSentenceFinder*/ {
+	// Logger named after this class; the methods of this class currently print
+	// via System.out / System.err instead of using it.
+	private static Logger LOG = Logger
+			.getLogger("opennlp.tools.similarity.apps.ContentGenerator");
+	// Used to download the page behind a search hit (fetchPage(url)).
+	PageFetcher pFetcher = new PageFetcher();
+	// Used for sentence splitting (splitSentences) and for syntactic relevance
+	// assessment of sentence pairs (assessRelevance).
+	ParserChunker2MatcherProcessor sm = ParserChunker2MatcherProcessor
+			.getInstance();
+	// Turns a match result into the numeric syntactic score.
+	protected ParseTreeChunkListScorer parseTreeChunkListScorer = new ParseTreeChunkListScorer();
+	// Only used in this class to render match results as text (listToString).
+	protected ParseTreeChunk parseTreeChunk = new ParseTreeChunk();
+	// String-level distance between the seed and a mined sentence.
+	protected static StringDistanceMeasurer stringDistanceMeasurer = new StringDistanceMeasurer();
+	// Search runner; receives the API key and language set on this generator.
+	protected BingQueryRunner yrunner = new BingQueryRunner();
+	// Provides extractSentencesFromPage for downloaded pages.
+	protected ContentGeneratorSupport support = new ContentGeneratorSupport();
+	// Bound on the number of search steps performed by generateContentAbout.
+	protected int MAX_STEPS = 1;
+	// Number of hits requested per query; longer result lists are truncated to it.
+	protected int MAX_SEARCH_RESULTS = 1;
+	// Syntactic score a mined sentence is compared against when deciding acceptance.
+	protected float RELEVANCE_THRESHOLD = 1.1f;
+
+	//private static final int MAX_FRAGMENT_SENTS = 10;
+
+	/**
+	 * Creates a generator with explicit limits and a search key.
+	 * 
+	 * @param ms
+	 *          value stored in MAX_STEPS
+	 * @param msr
+	 *          value stored in MAX_SEARCH_RESULTS
+	 * @param thresh
+	 *          value stored in RELEVANCE_THRESHOLD
+	 * @param key
+	 *          key handed to the search runner via setKey
+	 */
+	public ContentGenerator(int ms, int msr, float thresh, String key) {
+		RELEVANCE_THRESHOLD = thresh;
+		MAX_SEARCH_RESULTS = msr;
+		MAX_STEPS = ms;
+		// configure the runner last, once the limits are in place
+		yrunner.setKey(key);
+	}
+
+	/**
+	 * Creates a generator that keeps the default limits declared on the fields
+	 * of this class; no search key is set by this constructor.
+	 */
+	public ContentGenerator() {
+		// intentionally empty: the field initializers supply all defaults
+	}
+	/**
+	 * Forwards the language setting to the underlying search runner.
+	 * 
+	 * @param lang
+	 *          value passed unchanged to BingQueryRunner.setLang (presumably a
+	 *          locale code such as "es-MX" -- confirm against BingQueryRunner)
+	 */
+	public void setLang(String lang) {
+		yrunner.setLang(lang);
+
+	}
+
+
+	/**
+	 * Main content generation function which takes a seed as a person, rock
+	 * group, or other entity name and produce a list of text fragments by web
+	 * mining for <br>
+	 * 
+	 * @param String
+	 *          entity name
+	 * @return List<HitBase> of text fragment structures which contain approved
+	 *         (in terms of relevance) mined sentences, as well as original search
+	 *         results objects such as doc titles, abstracts, and urls.
+	 */
+
+	public List<HitBase> generateContentAbout(String sentence) throws Exception {
+		List<HitBase> opinionSentencesToAdd = new ArrayList<HitBase>();
+		System.out.println(" \n=== Entity to write about = " + sentence);
+	
+		int stepCount=0;
+		for (String verbAddition : StoryDiscourseNavigator.frequentPerformingVerbs) {
+			List<HitBase> searchResult = yrunner.runSearch(sentence + " "
+					+ verbAddition, MAX_SEARCH_RESULTS); //100);
+			if (MAX_SEARCH_RESULTS<searchResult.size())
+				searchResult = searchResult.subList(0, MAX_SEARCH_RESULTS);
+			//TODO for shorter run
+			if (searchResult != null) {
+				for (HitBase item : searchResult) { // got some text from .html
+					if (item.getAbstractText() != null
+							&& !(item.getUrl().indexOf(".pdf") > 0)) { // exclude pdf
+						opinionSentencesToAdd
+						.add(buildParagraphOfGeneratedText(item, sentence, null));
+					}
+				}
+			}
+			stepCount++;
+			if (stepCount>MAX_STEPS)
+				break;
+		}
+
+		opinionSentencesToAdd = ContentGeneratorSupport.removeDuplicatesFromResultantHits(opinionSentencesToAdd);
+		return opinionSentencesToAdd;
+	}
+
+	/**
+	 * Takes a sentence and extracts noun phrases and entity names to form search
+	 * queries for finding relevant sentences on the web, which are then subject
+	 * to relevance assessment by Similarity. Search queries should not be too
+	 * general (irrelevant search results) or too specific (too few search
+	 * results).
+	 * <p>
+	 * Only the first phrase group returned by the chunker is used. From each
+	 * phrase the lemmas whose POS tag starts with "N" or "J" are kept. A first
+	 * pass accepts phrases of 2 to 5 such words, and for phrases shorter than 4
+	 * words requires every word to contain an upper-case character. If that
+	 * pass yields nothing, a second pass accepts any phrase with at least 2
+	 * words. The input sentence itself is always appended as the last query.
+	 * 
+	 * @param sentence
+	 *          input sentence to form queries
+	 * @return {@code List<String>} of search expressions
+	 */
+	public static List<String> buildSearchEngineQueryFromSentence(String sentence) {
+		ParserChunker2MatcherProcessor pos = ParserChunker2MatcherProcessor
+				.getInstance();
+		List<String> queryArrayStr = new ArrayList<String>();
+
+		List<List<ParseTreeChunk>> groupedPhrases = pos
+				.formGroupedPhrasesFromChunksForSentence(sentence);
+		if (groupedPhrases == null || groupedPhrases.isEmpty()) {
+			// nothing could be chunked: fall back to the sentence as the only query
+			queryArrayStr.add(sentence);
+			return queryArrayStr;
+		}
+		List<ParseTreeChunk> nPhrases = groupedPhrases.get(0);
+
+		for (ParseTreeChunk ch : nPhrases) {
+			String query = joinNounAndAdjectiveLemmas(ch);
+			String[] qs = query.split(" ");
+			int len = qs.length;
+			if (len < 2 || len > 5)
+				continue;
+			// if only two or three words then it has to be a person name,
+			// title or geo location, i.e. no word may be all lower case
+			if (len < 4 && !everyWordHasUpperCase(qs))
+				continue;
+			queryArrayStr.add(toMandatoryTermsQuery(query));
+		}
+		if (queryArrayStr.size() < 1) { // release constraints on NP down to 2
+			// keywords
+			for (ParseTreeChunk ch : nPhrases) {
+				String query = joinNounAndAdjectiveLemmas(ch);
+				if (query.split(" ").length < 2)
+					continue;
+				queryArrayStr.add(toMandatoryTermsQuery(query));
+			}
+		}
+
+		queryArrayStr = ContentGeneratorSupport.removeDuplicatesFromQueries(queryArrayStr);
+		queryArrayStr.add(sentence);
+
+		return queryArrayStr;
+
+	}
+
+	/** Joins, space separated, the lemmas of a chunk whose POS tag starts with "N" or "J". */
+	private static String joinNounAndAdjectiveLemmas(ParseTreeChunk ch) {
+		StringBuilder buf = new StringBuilder();
+		int size = ch.getLemmas().size();
+		for (int i = 0; i < size; i++) {
+			String posTag = ch.getPOSs().get(i);
+			if (posTag.startsWith("N") || posTag.startsWith("J")) {
+				buf.append(ch.getLemmas().get(i)).append(" ");
+			}
+		}
+		return buf.toString().trim();
+	}
+
+	/** Returns false as soon as one word equals its own lower-case form. */
+	private static boolean everyWordHasUpperCase(String[] words) {
+		for (String w : words) {
+			if (w.toLowerCase().equals(w))
+				return false;
+		}
+		return true;
+	}
+
+	/** Prefixes every word of a space-separated query with " +". */
+	private static String toMandatoryTermsQuery(String query) {
+		return " +" + query.trim().replace(" ", " +");
+	}
+
+	/**
+	 * Prepares the raw material for one search hit: the snippet split into
+	 * fragments and, when the snippet contains an ellipsis, the downloaded page
+	 * and its sentences.
+	 * <p>
+	 * Side effects on the hit: the original sentence is stored via
+	 * setOriginalSentences, and the page content via setPageContent when a page
+	 * longer than 100 characters was fetched.
+	 * 
+	 * @param item
+	 *          search result; its abstract text must not be null
+	 * @param originalSentence
+	 *          seed sentence, recorded on the hit
+	 * @param sentsAll
+	 *          not used by this method; kept for signature compatibility
+	 * @return triple of (snippet fragments, downloaded page or null, page
+	 *         sentences or null)
+	 */
+	private Triple<List<String>, String, String[]> formCandidateFragmentsForPage(HitBase item, String originalSentence, List<String> sentsAll){
+		// put orig sentence in structure
+		List<String> origs = new ArrayList<String>();
+		origs.add(originalSentence);
+		item.setOriginalSentences(origs);
+
+		// form plain text from snippet
+		String snapshot = item.getAbstractText().replace("<b>", " ")
+				.replace("</b>", " ").replace("  ", " ").replace("  ", " ");
+
+		// fix a template expression which can be substituted by original if
+		// relevant
+		String snapshotMarked = snapshot.replace("...",
+				" _should_find_orig_ . _should_find_orig_");
+		List<String> allFragms = new ArrayList<String>(
+				Arrays.asList(sm.splitSentences(snapshotMarked)));
+
+		String[] sents = null;
+		String downloadedPage = null;
+		try {
+			// the lengths differ only if at least one "..." was replaced, i.e. the
+			// snippet is truncated and the full sentence has to come from the page
+			if (snapshotMarked.length() != snapshot.length()) {
+				downloadedPage = pFetcher.fetchPage(item.getUrl());
+				if (downloadedPage != null && downloadedPage.length() > 100) {
+					item.setPageContent(downloadedPage);
+					String pageContent = Utils.fullStripHTML(item.getPageContent());
+					pageContent = GeneratedSentenceProcessor
+							.normalizeForSentenceSplitting(pageContent);
+					pageContent = ContentGeneratorSupport.cleanSpacesInCleanedHTMLpage(pageContent);
+
+					sents = sm.splitSentences(pageContent);
+
+					sents = ContentGeneratorSupport.cleanListOfSents(sents);
+				}
+			}
+		} catch (Exception e) {
+			// best effort: fall through and return whatever was collected so far
+			System.err
+			.println("Problem downloading  the page and splitting into sentences");
+		}
+		return new Triple<List<String>, String, String[]>(allFragms, downloadedPage, sents);
+	}
+
+	/**
+	 * Forms the candidate sentences for one snippet fragment.
+	 * <p>
+	 * If the fragment carries the "_should_find_orig_" marker and page
+	 * sentences are available, the full original sentence (and what follows it)
+	 * is looked up on the downloaded page: first among the sentences produced by
+	 * extractSentencesFromPage, then among the plain page sentences. Otherwise
+	 * the snippet fragment itself, with any marker removed, is the only
+	 * candidate.
+	 * 
+	 * @param fragment
+	 *          snippet fragment; fragments shorter than 50 characters are
+	 *          rejected
+	 * @param fragmentExtractionResults
+	 *          result of formCandidateFragmentsForPage
+	 * @return array whose first element is the main sentence and whose remaining
+	 *         elements are follow-up sentences, or null if there is no candidate
+	 */
+	private String[] formCandidateSentences(String fragment, Triple<List<String>, String, String[]> fragmentExtractionResults){
+		if (fragment.length() < 50)
+			return null;
+
+		String downloadedPage = (String)fragmentExtractionResults.getSecond();
+		String[] sents = (String[])fragmentExtractionResults.getThird();
+
+		// String.replace returns a new string, so the cleaned value must be kept
+		String cleanedFragment = fragment.replace("_should_find_orig_", "");
+
+		boolean lookUpOnPage = fragment.indexOf("_should_find_orig_") > -1
+				&& sents != null && sents.length > 0;
+		if (!lookUpOnPage) {
+			// or get original snippet
+			return new String[] { cleanedFragment };
+		}
+
+		String[] mainAndFollowSent = null;
+		// try to find original sentence from webpage
+		try {
+			// first try sorted sentences from page by length approach
+			String[] sentsSortedByLength = support.extractSentencesFromPage(downloadedPage);
+
+			try {
+				mainAndFollowSent = ContentGeneratorSupport.getFullOriginalSentenceFromWebpageBySnippetFragment(
+						cleanedFragment, sentsSortedByLength);
+			} catch (Exception e) {
+				// best effort: the second lookup below is still attempted
+				e.printStackTrace();
+			}
+			// if the above gives null then try to match all sentences from snippet fragment
+			if (mainAndFollowSent==null || mainAndFollowSent[0]==null){
+				mainAndFollowSent = ContentGeneratorSupport.getFullOriginalSentenceFromWebpageBySnippetFragment(
+						cleanedFragment, sents);
+			}
+		} catch (Exception e) {
+			// best effort: report and return whatever was found
+			e.printStackTrace();
+		}
+
+		return mainAndFollowSent;
+
+	}	
+
+	/**
+	 * Decides whether a candidate sentence mined for a snippet fragment is
+	 * relevant to the seed and, if so, wraps it (plus any follow-up sentences)
+	 * into a Fragment.
+	 * <p>
+	 * A candidate is rejected if it is null, not longer than 50 characters, at
+	 * least four times longer than the fragment, has no verb or an imperative
+	 * verb, scores too low, or is refused by
+	 * GeneratedSentenceProcessor.acceptableMinedSentence. Any Throwable raised
+	 * during scoring is printed and treated as a rejection.
+	 * 
+	 * @param candidateSentences
+	 *          main sentence at index 0, follow-up sentences after it; null
+	 *          follow-up entries are ignored
+	 * @param item
+	 *          search result supplying the title and source URL
+	 * @param fragment
+	 *          snippet fragment the candidate was derived from
+	 * @param originalSentence
+	 *          seed sentence
+	 * @param sentsAll
+	 *          other sentences of the seed, may be null
+	 * @return the accepted fragment, or null if the candidate was rejected
+	 */
+	private Fragment verifyCandidateSentencesAndFormParagraph(
+			String[] candidateSentences, HitBase item, String fragment, String originalSentence, List<String> sentsAll) {
+		Fragment result = null;	
+
+		String pageSentence = candidateSentences[0];
+		StringBuilder followBuf = new StringBuilder();
+		for (int i = 1; i < candidateSentences.length; i++) {
+			// a missing follow-up sentence must not end up as the text "null"
+			if (candidateSentences[i] != null)
+				followBuf.append(candidateSentences[i]);
+		}
+		String followSent = followBuf.toString();
+		String title = item.getTitle();
+
+		// resultant sentence SHOULD NOT be longer than four times the size of
+		// snippet fragment
+		if (!(pageSentence != null && pageSentence.length()>50 
+				&& (float) pageSentence.length() / (float) fragment.length() < 4.0) )
+			return null;
+
+
+		try { // get score from syntactic match between sentence in
+			// original text and mined sentence
+			double measScore = 0.0, syntScore = 0.0, mentalScore = 0.0;
+
+			SentencePairMatchResult matchRes = sm.assessRelevance(pageSentence
+					+ " " + title, originalSentence);
+			List<List<ParseTreeChunk>> match = matchRes.getMatchResult();
+			if (!matchRes.isVerbExists() || matchRes.isImperativeVerb()) {
+				System.out
+				.println("Rejected Sentence : No verb OR Yes imperative verb :"
+						+ pageSentence);
+				return null;
+			}
+
+			syntScore = parseTreeChunkListScorer
+					.getParseTreeChunkListScore(match);
+			System.out.println(parseTreeChunk.listToString(match) + " "
+					+ syntScore + "\n pre-processed sent = '" + pageSentence);
+
+			// trying other sents; callers may pass null when the seed is a single
+			// sentence
+			if (syntScore < RELEVANCE_THRESHOLD && sentsAll != null) { // 1.5)
+				List<List<ParseTreeChunk>> bestMatch = match;
+				for (String currSent : sentsAll) {
+					if (currSent.startsWith(originalSentence))
+						continue;
+					List<List<ParseTreeChunk>> currMatch = sm.assessRelevance(currSent, pageSentence)
+							.getMatchResult();
+					double syntScoreCurr = parseTreeChunkListScorer
+							.getParseTreeChunkListScore(currMatch);
+					if (syntScoreCurr > syntScore) {
+						syntScore = syntScoreCurr;
+						bestMatch = currMatch;
+					}
+				}
+				if (syntScore > RELEVANCE_THRESHOLD) {
+					// report the match that produced the winning score
+					System.out.println("Got match with other sent: "
+							+ parseTreeChunk.listToString(bestMatch) + " " + syntScore);
+				}
+			}
+
+			measScore = stringDistanceMeasurer.measureStringDistance(
+					originalSentence, pageSentence);
+
+
+			if ((syntScore > RELEVANCE_THRESHOLD || measScore > 0.5)
+					&& measScore < 0.8 && pageSentence.length() > 40) // >70
+			{
+				String pageSentenceProc = GeneratedSentenceProcessor
+						.acceptableMinedSentence(pageSentence);
+				if (pageSentenceProc != null) {
+					pageSentenceProc = GeneratedSentenceProcessor
+							.processSentence(pageSentenceProc);
+					followSent = GeneratedSentenceProcessor.processSentence(followSent);
+					if (followSent != null) {
+						pageSentenceProc += " "+ followSent;
+					}
+
+					pageSentenceProc = Utils.convertToASCII(pageSentenceProc);
+					// the length term adds one point per 50 characters of accepted text
+					result = new Fragment(pageSentenceProc, syntScore + measScore
+							+ mentalScore + (double) pageSentenceProc.length()
+							/ (double) 50);
+					result.setSourceURL(item.getUrl());
+					result.fragment = fragment;
+
+					System.out.println("Accepted sentence: " + pageSentenceProc
+							+ "| with title= " + title);
+					System.out.println("For fragment = " + fragment);
+				} else
+					System.out
+					.println("Rejected sentence due to wrong area at webpage: "
+							+ pageSentence);
+			} else
+				System.out.println("Rejected sentence due to low score: "
+						+ pageSentence);
+		} catch (Throwable t) {
+			// deliberate best effort: a failure while scoring rejects the candidate
+			t.printStackTrace();
+		}
+
+		return result;
+	}
+	/**
+	 * Takes a single search result for an entity which is the subject of the
+	 * essay to be written and forms essay sentences from the title, abstract,
+	 * and possibly the original page. The accepted fragments are stored on the
+	 * hit via setFragments.
+	 * 
+	 * @param item
+	 *          search result
+	 * @param originalSentence
+	 *          seed for the essay to be written
+	 * @param sentsAll
+	 *          list of other sentences in the seed if it is multi-sentence; may
+	 *          be null
+	 * @return the same search result object that was passed in
+	 */
+	public HitBase buildParagraphOfGeneratedText(HitBase item,
+			String originalSentence, List<String> sentsAll) {
+		List<Fragment> results = new ArrayList<Fragment>() ;
+		
+		Triple<List<String>, String, String[]> fragmentExtractionResults = formCandidateFragmentsForPage(item, originalSentence, sentsAll);
+
+		// only the snippet fragments are needed here; the downloaded page and
+		// its sentences are read from the triple by formCandidateSentences
+		List<String> allFragms = (List<String>)fragmentExtractionResults.getFirst();
+
+		for (String fragment : allFragms) {
+			String[] candidateSentences = formCandidateSentences(fragment, fragmentExtractionResults);
+			if (candidateSentences == null)
+				continue;
+			Fragment res = verifyCandidateSentencesAndFormParagraph(candidateSentences, item, fragment, originalSentence, sentsAll);
+			if (res!=null)
+				results.add(res);
+
+		}
+		
+		item.setFragments(results );
+		return item;
+	}
+
+
+
+
+/**
+ * Demo entry point: generates content for a fixed seed and prints the hits
+ * first in full and then as the resultant text.
+ */
+public static void main(String[] args) {
+	// Pick the seed that should drive content generation for an event
+	// description. Alternatives that have been tried:
+	// "Britney Spears - The Femme Fatale Tour"
+	// "Rush Time Machine",
+	// "Blue Man Group" ,
+	// "Belly Dance With Zaharah",
+	// "Hollander Musicology Lecture: Danielle Fosler-Lussier, Guest Lecturer",
+	// "Jazz Master and arguably the most famous jazz musician alive, trumpeter Wynton Marsalis",
+	ContentGenerator generator = new ContentGenerator();
+	try {
+		List<HitBase> generated = generator.generateContentAbout("Albert Einstein");
+		String fullDump = HitBase.toString(generated);
+		System.out.println(fullDump);
+		String resultantText = HitBase.toResultantString(generated);
+		System.out.println(resultantText);
+		// WordFileGenerator.createWordDoc("Essey about Albert Einstein",
+		// hits.get(0).getTitle(), hits);
+	} catch (Exception e) {
+		e.printStackTrace();
+	}
+}
+
+
+
+}
\ No newline at end of file

Added: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/ContentGeneratorRunner.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/ContentGeneratorRunner.java?rev=1555944&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/ContentGeneratorRunner.java (added)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/ContentGeneratorRunner.java Mon Jan  6 17:48:30 2014
@@ -0,0 +1,99 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.similarity.apps;
+
+import java.util.List;
+
+import javax.mail.internet.AddressException;
+import javax.mail.internet.InternetAddress;
+
+import opennlp.tools.apps.utils.email.EmailSender;
+import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;
+
+/**
+ * Command-line runner: generates content about a topic and e-mails it.
+ * <p>
+ * Arguments as read by index in main (meaning of indices 3-5 presumed from the
+ * finder constructor call -- confirm against RelatedSentenceFinder):
+ * 0 = topic ('+' is treated as space), 1 = recipient e-mail address,
+ * 2 = resource directory, 3 = number of steps, 4 = number of search results,
+ * 5 = relevance threshold, 6 = language, 7 = search API key. Arguments from
+ * index 2 on are optional.
+ */
+public class ContentGeneratorRunner {
+	// SECURITY(review): the API key and the SMTP account credentials below are
+	// hard-coded in source. They are kept so that behavior does not change, but
+	// they should be moved to external configuration and rotated.
+	private static final String DEFAULT_BING_KEY = "e8ADxIjn9YyHx36EihdjH/tMqJJItUrrbPTUpKahiU0=";
+	private static final String SMTP_HOST = "smtp.live.com";
+	private static final String SMTP_USER = "bgalitsky@hotmail.com";
+	private static final String SMTP_PASSWORD = "borgalor";
+
+	public static void main(String[] args) {
+		if (args == null || args.length < 2) {
+			System.err.println("Usage: ContentGeneratorRunner <topic> <email> "
+					+ "[resourceDir [stepsNum searchResultsNum relevanceThreshold [lang [bingKey]]]]");
+			return;
+		}
+
+		try {
+			// initialise the shared processor, from the resource dir if one is given
+			String resourceDir = argOrNull(args, 2);
+			if (resourceDir != null)
+				ParserChunker2MatcherProcessor.getInstance(resourceDir);
+			else
+				ParserChunker2MatcherProcessor.getInstance();
+		} catch (Exception e) {
+			// best effort: generation is still attempted below
+			e.printStackTrace();
+		}
+
+		String bingKey = argOrNull(args, 7);
+		if (bingKey == null) {
+			bingKey = DEFAULT_BING_KEY;
+		}
+
+		RelatedSentenceFinder f;
+		String lang = argOrNull(args, 6);
+		if (lang != null && lang.startsWith("es")) {
+			// a language argument at index 6 implies indices 3..5 are present
+			f = new RelatedSentenceFinderML(Integer.parseInt(args[3]), Integer.parseInt(args[4]), Float.parseFloat(args[5]), bingKey);
+			f.setLang(lang);
+		} else if (args.length > 5) {
+			// all three tuning arguments (indices 3..5) must be present
+			f = new RelatedSentenceFinder(Integer.parseInt(args[3]), Integer.parseInt(args[4]), Float.parseFloat(args[5]), bingKey);
+		} else {
+			f = new RelatedSentenceFinder();
+		}
+
+		try {
+			List<HitBase> hits = f.generateContentAbout(args[0].replace('+', ' ').replace('"', ' ').trim());
+			System.out.println(HitBase.toString(hits));
+			String generatedContent = HitBase.toResultantString(hits);
+
+			EmailSender s = new EmailSender();
+			String topic = args[0].replace('+', ' ');
+			try {
+				sendGeneratedContent(s, args[1], topic, generatedContent);
+			} catch (AddressException e) {
+				// a malformed address will not get better on retry
+				e.printStackTrace();
+			} catch (Exception e) {
+				e.printStackTrace();
+				// any other failure: retry exactly once
+				try {
+					sendGeneratedContent(s, args[1], topic, generatedContent);
+				} catch (Exception e1) {
+					e1.printStackTrace();
+				}
+			}
+		} catch (Exception e) {
+			e.printStackTrace();
+		}
+
+	}
+
+	/** Returns args[index], or null when fewer arguments were supplied. */
+	private static String argOrNull(String[] args, int index) {
+		return index < args.length ? args[index] : null;
+	}
+
+	/** Sends the generated content for the topic to a single recipient. */
+	private static void sendGeneratedContent(EmailSender sender, String recipient,
+			String topic, String generatedContent) throws Exception {
+		sender.sendMail(SMTP_HOST, SMTP_USER, SMTP_PASSWORD, new InternetAddress(SMTP_USER), new InternetAddress[]{new InternetAddress(recipient)}, new InternetAddress[]{}, new InternetAddress[]{}, 
+				"Generated content for you on '"+topic+"'", generatedContent, null);
+	}
+}
+
+/*
+ * C:\stanford-corenlp>java -Xmx1g -jar pt.jar albert+einstein bgalitsky@hotmail.com C:/stanford-corenlp/src/test/resources
+ * 
+ * http://173.255.254.250:8983/solr/contentgen/?q=albert+einstein&email=bgalitsky@hotmail.com&resourceDir=/home/solr/solr-4.4.0/example/src/test/resources&workDir=/home/solr/solr-4.4.0/example/solr-webapp/webapp/WEB-INF/lib&stepsNum=20&searchResultsNum=100&relevanceThreshold=0.5&lang=es-US&bingKey=e8ADxIjn9YyHx36EihdjH/tMqJJItUrrbPTUpKahiU0=
+ */