You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by bg...@apache.org on 2014/01/06 18:48:32 UTC
svn commit: r1555944 [6/11] - in /opennlp/sandbox/opennlp-similarity/src:
main/java/opennlp/tools/apps/ main/java/opennlp/tools/apps/contentgen/
main/java/opennlp/tools/apps/contentgen/multithreaded/
main/java/opennlp/tools/apps/relevanceVocabs/ main/j...
Added: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/ParseTreePathMatcherDeterministic.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/ParseTreePathMatcherDeterministic.java?rev=1555944&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/ParseTreePathMatcherDeterministic.java (added)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/ParseTreePathMatcherDeterministic.java Mon Jan 6 17:48:30 2014
@@ -0,0 +1,280 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.parse_thicket.matching;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import opennlp.tools.stemmer.PorterStemmer;
+import opennlp.tools.textsimilarity.POSManager;
+
+
+public class ParseTreePathMatcherDeterministic {
+
+ // Filters each per-component result list via applyFilteringBySubsumption
+ // (see matchTwoSentencesGroupedChunksDeterministic).
+ private GeneralizationListReducer generalizationListReducer = new GeneralizationListReducer();
+
+ // Decides, via matchLemmas, what lemma (if any) two aligned words share.
+ private LemmaFormManager lemmaFormManager = new LemmaFormManager();
+
+ // Compares the POS tags of two aligned words via similarPOS.
+ private POSManager posManager = new POSManager();
+
+ /**
+  * Key matching function: takes two phrases, aligns them and finds the set of
+  * maximal common sub-phrases.
+  * <p>
+  * The lemmas of both phrases are stemmed and the stems present in both
+  * phrases serve as anchors. The anchors are split into runs whose positions
+  * grow in both phrases; for every run the two phrases are walked in parallel
+  * and each position with similar POS tags contributes either the matched
+  * lemma or the wildcard "*" to the generalization.
+  *
+  * @param chunk1 first phrase
+  * @param chunk2 second phrase
+  * @return one generalized phrase per anchor run, or {@code null} when the
+  *         two phrases share no stem
+  */
+ public List<ParseTreePath> generalizeTwoGroupedPhrasesDeterministic(
+     ParseTreePath chunk1, ParseTreePath chunk2) {
+   List<String> pos1 = chunk1.getPOSs();
+   List<String> pos2 = chunk2.getPOSs();
+   List<String> lem1 = chunk1.getLemmas();
+   List<String> lem2 = chunk2.getLemmas();
+
+   PorterStemmer ps = new PorterStemmer();
+   // stem lists are position-aligned with lem1 / lem2 (null = not stemmable)
+   List<String> lem1stem = stemKeepingPositions(ps, lem1);
+   List<String> lem2stem = stemKeepingPositions(ps, lem2);
+
+   // stems of the first phrase which also occur in the second one, in the
+   // order (and multiplicity) of the first phrase; unstemmable words never
+   // act as anchors
+   List<String> overlap = new ArrayList<String>();
+   for (String stem : lem1stem) {
+     if (stem != null && lem2stem.contains(stem))
+       overlap.add(stem);
+   }
+
+   if (overlap.isEmpty())
+     return null;
+
+   // position of the first occurrence of every anchor in each phrase
+   List<Integer> occur1 = new ArrayList<Integer>(), occur2 = new ArrayList<Integer>();
+   for (String word : overlap) {
+     occur1.add(lem1stem.indexOf(word));
+     occur2.add(lem2stem.indexOf(word));
+   }
+
+   // now we search for plausible sublists of overlaps:
+   // if at some position the correspondence is inverse (one of the two
+   // positions decreases instead of increasing) then we terminate the current
+   // alignment accumulator and start a new one
+   List<List<int[]>> overlapsPlaus = new ArrayList<List<int[]>>();
+   List<int[]> accum = new ArrayList<int[]>();
+   accum.add(new int[] { occur1.get(0), occur2.get(0) });
+   // the first pair is already accumulated, so the scan starts from 1, not 0
+   for (int i = 1; i < occur1.size(); i++) {
+     if (occur1.get(i) > occur1.get(i - 1)
+         && occur2.get(i) > occur2.get(i - 1))
+       accum.add(new int[] { occur1.get(i), occur2.get(i) });
+     else {
+       overlapsPlaus.add(accum);
+       accum = new ArrayList<int[]>();
+       accum.add(new int[] { occur1.get(i), occur2.get(i) });
+     }
+   }
+   if (accum.size() > 0) {
+     overlapsPlaus.add(accum);
+   }
+
+   List<ParseTreePath> results = new ArrayList<ParseTreePath>();
+   for (List<int[]> occur : overlapsPlaus) {
+     List<Integer> occr1 = new ArrayList<Integer>(), occr2 = new ArrayList<Integer>();
+     for (int[] column : occur) {
+       occr1.add(column[0]);
+       occr2.add(column[1]);
+     }
+
+     int ov1 = 0, ov2 = 0; // iterators over common words
+     List<String> commonPOS = new ArrayList<String>(), commonLemmas = new ArrayList<String>();
+     // we start two words before the first common word ...
+     int k1 = occr1.get(ov1) - 2, k2 = occr2.get(ov2) - 2;
+     boolean bReachedCommonWord = false;
+     // ... shifting both cursors together until neither is negative
+     while (k1 < 0 || k2 < 0) {
+       k1++;
+       k2++;
+     }
+     int k1max = pos1.size() - 1, k2max = pos2.size() - 1;
+     while (k1 <= k1max && k2 <= k2max) {
+       // first check if the same POS
+       String sim = posManager.similarPOS(pos1.get(k1), pos2.get(k2));
+       String lemmaMatch = lemmaFormManager.matchLemmas(ps, lem1.get(k1),
+           lem2.get(k2), sim);
+       if ((sim != null)
+           && (lemmaMatch == null || !lemmaMatch.equals("fail"))) {
+         commonPOS.add(pos1.get(k1));
+         if (lemmaMatch != null) {
+           commonLemmas.add(lemmaMatch);
+           if (k1 == occr1.get(ov1) && k2 == occr2.get(ov2))
+             bReachedCommonWord = true; // now a mismatch may jump to the next anchor
+           else {
+             // the match is the next anchor: advance the anchor iterators
+             if (occr1.size() > ov1 + 1 && occr2.size() > ov2 + 1
+                 && k1 == occr1.get(ov1 + 1) && k2 == occr2.get(ov2 + 1)) {
+               ov1++;
+               ov2++;
+               bReachedCommonWord = true;
+             }
+           }
+         } else {
+           // similar POS but no common lemma: keep the slot as a wildcard
+           commonLemmas.add("*");
+         }
+         // the same parts of speech, proceed to the next word in both
+         // expressions
+         k1++;
+         k2++;
+
+       } else if (!bReachedCommonWord) {
+         // still searching for the current anchor
+         k1++;
+         k2++;
+       } else {
+         // different parts of speech, jump to the next identified common word
+         ov1++;
+         ov2++;
+         if (ov1 > occr1.size() - 1 || ov2 > occr2.size() - 1)
+           break;
+         // new positions of the cursors: two words before the next anchor
+         int kk1 = occr1.get(ov1) - 2, kk2 = occr2.get(ov2) - 2;
+         int countMove = 0;
+         // if a new position is behind the current one, synchronously move
+         // both towards the right (at most two steps)
+         while ((kk1 < k1 + 1 || kk2 < k2 + 1) && countMove < 2) {
+           kk1++;
+           kk2++;
+           countMove++;
+         }
+         k1 = kk1;
+         k2 = kk2;
+
+         if (k1 > k1max)
+           k1 = k1max;
+         if (k2 > k2max)
+           k2 = k2max;
+         bReachedCommonWord = false;
+       }
+     }
+     ParseTreePath currResult = new ParseTreePath(commonLemmas, commonPOS,
+         0, 0);
+     results.add(currResult);
+   }
+
+   return results;
+ }
+
+ /**
+  * Stems every lemma while keeping the result aligned with the input: entry
+  * i of the returned list is the stem of lemma i, or {@code null} when that
+  * lemma could not be stemmed. Keeping a slot for failed words is what lets
+  * stem indices be used as positions in the lemma and POS lists.
+  *
+  * @param ps stemmer to apply
+  * @param lemmas lemmas of one phrase; {@code null} is treated as empty
+  * @return list of stems of the same length as {@code lemmas}
+  */
+ private List<String> stemKeepingPositions(PorterStemmer ps, List<String> lemmas) {
+   List<String> stems = new ArrayList<String>();
+   if (lemmas == null)
+     return stems;
+   for (String word : lemmas) {
+     String stem = null;
+     try {
+       stem = ps.stem(word.toLowerCase()).toString();
+     } catch (Exception e) {
+       // best effort: very short words are not worth reporting
+       if (word != null && word.length() > 2)
+         System.err.println("Unable to stem: " + word);
+     }
+     stems.add(stem);
+   }
+   return stems;
+ }
+
+ /**
+  * Generalizes two expressions whose phrases are grouped by phrase type.
+  * Only the first two groups present in both inputs are compared. Within a
+  * group every phrase of the first expression is generalized against every
+  * phrase of the second one, and the collected generalizations are filtered
+  * by subsumption.
+  *
+  * @param sent1 phrases of the first expression, grouped by phrase type
+  * @param sent2 phrases of the second expression, grouped by phrase type
+  * @return for each compared phrase type, the filtered list of matched /
+  *         overlapped phrases
+  */
+ public List<List<ParseTreePath>> matchTwoSentencesGroupedChunksDeterministic(
+     List<List<ParseTreePath>> sent1, List<List<ParseTreePath>> sent2) {
+   List<List<ParseTreePath>> results = new ArrayList<List<ParseTreePath>>();
+   // iterate through the components first (just np & vp)
+   for (int comp = 0; comp < 2 && comp < sent1.size() && comp < sent2.size(); comp++) {
+     List<ParseTreePath> collected = new ArrayList<ParseTreePath>();
+     // then pair up every phrase of one component with every phrase of the other
+     for (ParseTreePath phrase1 : sent1.get(comp)) {
+       for (ParseTreePath phrase2 : sent2.get(comp)) {
+         List<ParseTreePath> candidates = generalizeTwoGroupedPhrasesDeterministic(
+             phrase1, phrase2);
+         if (candidates == null || candidates.isEmpty())
+           continue; // nothing in common for this pair
+
+         // the whole batch is dropped when it repeats an already collected phrase
+         boolean duplicate = false;
+         for (ParseTreePath known : collected) {
+           if (candidates.contains(known)) {
+             duplicate = true;
+             break;
+           }
+         }
+         if (!duplicate)
+           collected.addAll(candidates);
+       }
+     }
+     results.add(generalizationListReducer.applyFilteringBySubsumption(collected));
+   }
+
+   return results;
+ }
+
+}
Added: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/parse_thicket2graph/EdgeProductBuilder.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/parse_thicket2graph/EdgeProductBuilder.java?rev=1555944&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/parse_thicket2graph/EdgeProductBuilder.java (added)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/parse_thicket2graph/EdgeProductBuilder.java Mon Jan 6 17:48:30 2014
@@ -0,0 +1,121 @@
+package opennlp.tools.parse_thicket.parse_thicket2graph;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.List;
+import java.util.Set;
+
+import opennlp.tools.parse_thicket.ParseCorefsBuilder;
+import opennlp.tools.parse_thicket.ParseThicket;
+import opennlp.tools.parse_thicket.ParseTreeNode;
+import opennlp.tools.parse_thicket.matching.Matcher;
+import opennlp.tools.textsimilarity.ParseTreeChunk;
+
+import org.jgrapht.Graph;
+import org.jgrapht.alg.BronKerboschCliqueFinder;
+import org.jgrapht.graph.DefaultEdge;
+import org.jgrapht.graph.SimpleGraph;
+
+
+public class EdgeProductBuilder {
+ // generalizes the phrase nodes of two graph vertices (see buildEdgeProduct)
+ private Matcher matcher = new Matcher();
+ // turns a paragraph of text into a ParseThicket
+ private ParseCorefsBuilder ptBuilder = ParseCorefsBuilder.getInstance();
+ // turns a ParseThicket into a graph of ParseGraphNode vertices
+ private GraphFromPTreeBuilder graphBuilder = new GraphFromPTreeBuilder();
+
+
+ /**
+  * Builds the edge product of two parse graphs. A product vertex is the
+  * array {source1, target1, source2, target2} of an edge of g1 and an edge of
+  * g2 whose sources generalize to something non-empty and whose targets do
+  * too. Two product vertices are connected when
+  * bothAjacentOrNeitherAdjacent accepts them.
+  *
+  * @param g1 first parse graph
+  * @param g2 second parse graph
+  * @return the product graph
+  */
+ public Graph<ParseGraphNode[], DefaultEdge>
+ buildEdgeProduct(Graph<ParseGraphNode, DefaultEdge> g1, Graph<ParseGraphNode, DefaultEdge> g2 ){
+   Graph<ParseGraphNode[], DefaultEdge> product =
+       new SimpleGraph<ParseGraphNode[], DefaultEdge>(DefaultEdge.class);
+
+   Set<DefaultEdge> firstEdges = g1.edgeSet();
+   Set<DefaultEdge> secondEdges = g2.edgeSet();
+   // vertices of the product graph: compatible pairs of edges
+   for (DefaultEdge first : firstEdges) {
+     ParseGraphNode from1 = g1.getEdgeSource(first);
+     ParseGraphNode to1 = g1.getEdgeTarget(first);
+     for (DefaultEdge second : secondEdges) {
+       ParseGraphNode from2 = g2.getEdgeSource(second);
+       ParseGraphNode to2 = g2.getEdgeTarget(second);
+
+       if (!isNotEmpty(matcher.generalize(from1.getPtNodes(), from2.getPtNodes())))
+         continue;
+       if (!isNotEmpty(matcher.generalize(to1.getPtNodes(), to2.getPtNodes())))
+         continue;
+       product.addVertex(new ParseGraphNode[] { from1, to1, from2, to2 });
+     }
+   }
+
+   // edges of the product graph: every unordered pair of vertices is tested once
+   List<ParseGraphNode[]> vertices = new ArrayList<ParseGraphNode[]>(product.vertexSet());
+   int total = vertices.size();
+   for (int i = 0; i < total; i++) {
+     ParseGraphNode[] left = vertices.get(i);
+     for (int j = i + 1; j < total; j++) {
+       ParseGraphNode[] right = vertices.get(j);
+       if (bothAjacentOrNeitherAdjacent(left, right)) {
+         product.addEdge(left, right);
+       }
+     }
+   }
+
+   return product;
+ }
+ /**
+  * Returns the biggest maximal cliques of the given product graph, as found
+  * by the Bron-Kerbosch clique finder. Finding the maximal clique is the
+  * slowest part.
+  *
+  * @param g product graph built by buildEdgeProduct
+  * @return the biggest maximal cliques, each one a set of product vertices
+  */
+ public Collection<Set<ParseGraphNode[]>> getMaximalCommonSubgraphs(Graph<ParseGraphNode[], DefaultEdge> g){
+   return new BronKerboschCliqueFinder<ParseGraphNode[], DefaultEdge>(g)
+       .getBiggestMaximalCliques();
+ }
+
+
+ /**
+  * Tells whether two product vertices should be connected: counts the
+  * entries of the first vertex which also occur in the second one and accepts
+  * the pair when that count is exactly 2 or exactly 4.
+  */
+ private boolean bothAjacentOrNeitherAdjacent(ParseGraphNode[] prodVertexI,
+     ParseGraphNode[] prodVertexJ) {
+   List<ParseGraphNode> others = Arrays.asList(prodVertexJ);
+   int shared = 0;
+   for (ParseGraphNode node : prodVertexI) {
+     if (others.contains(node))
+       shared++;
+   }
+   return shared == 2 || shared == 4;
+ }
+
+
+ /**
+  * Tells whether a generalization result carries anything in its first
+  * group. A null result, a result without groups and a result whose first
+  * group is null or empty all count as empty.
+  *
+  * @param generalize generalization result, may be null or have no groups
+  * @return true only when the first group exists and has at least one chunk
+  */
+ private boolean isNotEmpty(List<List<ParseTreeChunk>> generalize) {
+   // an empty outer list used to make get(0) throw IndexOutOfBoundsException
+   if (generalize == null || generalize.isEmpty())
+     return false;
+   List<ParseTreeChunk> firstGroup = generalize.get(0);
+   return firstGroup != null && !firstGroup.isEmpty();
+ }
+
+ /**
+  * Compares two paragraphs of text: builds a parse thicket and a parse graph
+  * for each of them, forms the edge product of the two graphs and returns its
+  * biggest maximal cliques.
+  *
+  * @param para1 first paragraph
+  * @param para2 second paragraph
+  * @return the maximal common subgraphs as sets of product vertices
+  */
+ public Collection<Set<ParseGraphNode[]>> assessRelevanceViaMaximalCommonSubgraphs(String para1, String para2) {
+   // parse thickets for both texts first
+   ParseThicket firstThicket = ptBuilder.buildParseThicket(para1);
+   ParseThicket secondThicket = ptBuilder.buildParseThicket(para2);
+   // then a graph for each thicket
+   Graph<ParseGraphNode, DefaultEdge> firstGraph = graphBuilder.buildGraphFromPT(firstThicket);
+   Graph<ParseGraphNode, DefaultEdge> secondGraph = graphBuilder.buildGraphFromPT(secondThicket);
+
+   return getMaximalCommonSubgraphs(buildEdgeProduct(firstGraph, secondGraph));
+ }
+
+ /**
+  * Demo entry point: compares two short paragraphs about the same news story
+  * and prints the resulting maximal common subgraphs.
+  *
+  * @param args ignored
+  */
+ public static void main(String[] args){
+   EdgeProductBuilder b = new EdgeProductBuilder();
+   // every sentence ends with ". " so that consecutive sentences are not
+   // glued together (the first sentence of each paragraph used to lack the space)
+   Collection<Set<ParseGraphNode[]>> col = b.assessRelevanceViaMaximalCommonSubgraphs(
+       "Iran refuses to accept the UN proposal to end its dispute over its work on nuclear weapons. " +
+       "UN nuclear watchdog passes a resolution condemning Iran for developing its second uranium enrichment site in secret. " +
+       "A recent IAEA report presented diagrams that suggested Iran was secretly working on nuclear weapons. " +
+       "Iran envoy says its nuclear development is for peaceful purpose, and the material evidence against it has been fabricated by the US. "
+
+       , "Iran refuses the UN offer to end a conflict over its nuclear weapons. " +
+       "UN passes a resolution prohibiting Iran from developing its uranium enrichment site. " +
+       "A recent UN report presented charts saying Iran was working on nuclear weapons. " +
+       "Iran envoy to UN states its nuclear development is for peaceful purpose, and the evidence against its claim is fabricated by the US. ");
+   System.out.print(col);
+ }
+}
+
Added: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/parse_thicket2graph/GraphFromPTreeBuilder.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/parse_thicket2graph/GraphFromPTreeBuilder.java?rev=1555944&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/parse_thicket2graph/GraphFromPTreeBuilder.java (added)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/parse_thicket2graph/GraphFromPTreeBuilder.java Mon Jan 6 17:48:30 2014
@@ -0,0 +1,131 @@
+package opennlp.tools.parse_thicket.parse_thicket2graph;
+
+import java.io.PrintWriter;
+import java.util.List;
+
+import opennlp.tools.parse_thicket.PTTree;
+import opennlp.tools.parse_thicket.ParseThicket;
+import opennlp.tools.parse_thicket.ParseTreeNode;
+import org.jgrapht.Graph;
+import org.jgrapht.graph.DefaultDirectedWeightedGraph;
+import org.jgrapht.graph.DefaultEdge;
+import org.jgrapht.graph.SimpleGraph;
+
+
+import edu.stanford.nlp.trees.LabeledScoredTreeNode;
+import edu.stanford.nlp.trees.Tree;
+
+public class GraphFromPTreeBuilder {
+
+
+ /**
+  * Builds a graph from a parse thicket. Only the first sentence of the
+  * thicket is used: its tree is printed to standard output in Penn format and
+  * then converted to a graph.
+  *
+  * @param pt parse thicket with at least one sentence
+  * @return graph of the first sentence
+  */
+ public Graph<ParseGraphNode, DefaultEdge> buildGraphFromPT(ParseThicket pt){
+   PrintWriter console = new PrintWriter(System.out);
+
+   List<Tree> sentences = pt.getSentences();
+   Tree firstSentence = sentences.get(0);
+   firstSentence.pennPrint(console);
+
+   // to visualize the result: new ParseTreeVisualizer().showGraph(...)
+   return buildGGraphFromTree(sentences.get(0));
+ }
+
+
+ /**
+  * Converts one parse tree into a graph: the tree itself becomes the root
+  * vertex labelled "S 0" and the rest is added by navigate.
+  */
+ private Graph<ParseGraphNode, DefaultEdge> buildGGraphFromTree(Tree tree) {
+   Graph<ParseGraphNode, DefaultEdge> graph =
+       new SimpleGraph<ParseGraphNode, DefaultEdge>(DefaultEdge.class);
+   ParseGraphNode rootNode = new ParseGraphNode(tree, "S 0");
+   graph.addVertex(rootNode);
+   navigate(tree, graph, 0, rootNode);
+   return graph;
+ }
+
+
+
+ // Recursively adds the non-leaf descendants of 'tree' to graph 'g'.
+ // For every non-leaf child a ParseGraphNode is created, added as a vertex and
+ // linked by an edge to 'currParent'; the recursion then continues below that
+ // child with the new node as parent.
+ //   tree       - subtree whose children are processed
+ //   g          - graph being filled
+ //   l          - current depth, appended to every vertex label as " #<l>"
+ //   currParent - graph node the children of 'tree' are attached to
+ private void navigate(Tree tree, Graph<ParseGraphNode, DefaultEdge> g, int l, ParseGraphNode currParent) {
+ //String currParent = tree.label().value()+" $"+Integer.toString(l);
+ //g.addVertex(currParent);
+ // A single child is descended into directly, keeping the same parent.
+ // NOTE(review): there is no return after this call, so the loop below visits
+ // the same single child again - confirm this double processing is intended.
+ if (tree.getChildrenAsList().size()==1)
+ navigate(tree.getChildrenAsList().get(0), g, l+1, currParent);
+ else
+ if (tree.getChildrenAsList().size()==0)
+ return;
+
+ for(Tree child: tree.getChildrenAsList()){
+ String currChild = null;
+ ParseGraphNode currChildNode = null;
+ try {
+ // leaves get no vertex
+ if (child.isLeaf())
+ continue;
+ // any label starting with "S" triggers an extra descent into the first
+ // grandchild, attached to the current parent; the child itself is still
+ // added below. NOTE(review): the prefix test is presumably meant for
+ // clause labels, but it matches every label beginning with "S" - confirm.
+ if (child.label().value().startsWith("S"))
+ navigate(child.getChildrenAsList().get(0), g, l+1, currParent);
+
+ // vertex label: the whole subtree text for non-phrasal / pre-terminal
+ // nodes, only the node label otherwise; both followed by " #<depth>"
+ if (!child.isPhrasal() || child.isPreTerminal())
+ currChild = child.toString()+" #"+Integer.toString(l);
+ else
+ currChild = child.label().value()+" #"+Integer.toString(l);
+ currChildNode = new ParseGraphNode(child, currChild);
+ g.addVertex(currChildNode);
+ g.addEdge(currParent, currChildNode);
+ } catch (Exception e) {
+ // failures are only reported; the recursion below still runs
+ e.printStackTrace();
+ }
+ // NOTE(review): when the try block failed before currChildNode was assigned,
+ // null is passed down as the parent - verify this is acceptable.
+ navigate(child, g, l+1, currChildNode);
+ }
+ }
+
+
+ /*
+ private static void navigateChildren(PTTree[] trChildren, int indent, boolean parentLabelNull, boolean onlyLabelValue, List<LabeledScoredTreeNode> phrases) {
+ boolean firstSibling = true;
+ boolean leftSibIsPreTerm = true; // counts as true at beginning
+ for (PTTree currentTree : trChildren) {
+ currentTree.navigate(indent, parentLabelNull, firstSibling, leftSibIsPreTerm, false, onlyLabelValue, phrases);
+ leftSibIsPreTerm = currentTree.isPreTerminal();
+ // CC is a special case for English, but leave it in so we can exactly match PTB3 tree formatting
+ if (currentTree.value() != null && currentTree.value().startsWith("CC")) {
+ leftSibIsPreTerm = false;
+ }
+ firstSibling = false;
+ }
+ }
+
+
+ private void navigate(int indent, boolean parentLabelNull, boolean firstSibling, boolean leftSiblingPreTerminal, boolean topLevel, boolean onlyLabelValue, List<LabeledScoredTreeNode> phrases) {
+ // the condition for staying on the same line in Penn Treebank
+ boolean suppressIndent = (parentLabelNull || (firstSibling && isPreTerminal()) || (leftSiblingPreTerminal && isPreTerminal() && (label() == null || !label().value().startsWith("CC"))));
+ if (suppressIndent) {
+ //pw.print(" ");
+ // pw.flush();
+ } else {
+ if (!topLevel) {
+ //pw.println();
+ }
+ for (int i = 0; i < indent; i++) {
+ //pw.print(" ");
+ // pw.flush();
+ }
+ }
+ if (isLeaf() || isPreTerminal()) {
+ String terminalString = toStringBuilder(new StringBuilder(), onlyLabelValue).toString();
+ //pw.print(terminalString);
+ //pw.flush();
+ return;
+ }
+ //pw.print("(");
+ String nodeString = onlyLabelValue ? value() : nodeString();
+ //pw.print(nodeString);
+ // pw.flush();
+ boolean parentIsNull = label() == null || label().value() == null;
+ navigateChildren(children(), indent + 1, parentIsNull, true, phrases);
+ //pw.print(")");
+
+ }
+ */
+
+}
Added: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/parse_thicket2graph/ParseGraphNode.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/parse_thicket2graph/ParseGraphNode.java?rev=1555944&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/parse_thicket2graph/ParseGraphNode.java (added)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/parse_thicket2graph/ParseGraphNode.java Mon Jan 6 17:48:30 2014
@@ -0,0 +1,51 @@
+package opennlp.tools.parse_thicket.parse_thicket2graph;
+
+import java.util.List;
+
+import opennlp.tools.parse_thicket.ParseTreeNode;
+import opennlp.tools.parse_thicket.matching.PT2ThicketPhraseBuilder;
+
+
+import edu.stanford.nlp.trees.Tree;
+
+/**
+ * Vertex of a parse graph: a parse (sub)tree together with a display label
+ * and the phrases built from that tree.
+ */
+public class ParseGraphNode {
+  PT2ThicketPhraseBuilder phraseBuilder = new PT2ThicketPhraseBuilder();
+
+  private Tree tree;
+  private String label;
+  private List<List<ParseTreeNode>> ptNodes;
+
+  /**
+   * Creates a node and immediately builds the phrases of the given tree.
+   *
+   * @param tree parse (sub)tree this node stands for
+   * @param label text returned by toString()
+   */
+  public ParseGraphNode(Tree tree, String label) {
+    this.tree = tree;
+    this.label = label;
+    this.ptNodes = phraseBuilder.buildPT2ptPhrasesForASentence(tree, null);
+  }
+
+  /** @return the phrases built from the tree at construction time */
+  public List<List<ParseTreeNode>> getPtNodes() {
+    return this.ptNodes;
+  }
+
+  /** @return the parse (sub)tree of this node */
+  public Tree getTree() {
+    return this.tree;
+  }
+
+  /** Replaces the tree; the phrases returned by getPtNodes() are not rebuilt. */
+  public void setTree(Tree tree) {
+    this.tree = tree;
+  }
+
+  /** @return the display label */
+  public String getLabel() {
+    return this.label;
+  }
+
+  /** Replaces the display label. */
+  public void setLabel(String label) {
+    this.label = label;
+  }
+
+  /** @return the display label */
+  @Override
+  public String toString() {
+    return this.label;
+  }
+}
+
Added: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/parse_thicket2graph/ParseTreeVisualizer.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/parse_thicket2graph/ParseTreeVisualizer.java?rev=1555944&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/parse_thicket2graph/ParseTreeVisualizer.java (added)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/parse_thicket2graph/ParseTreeVisualizer.java Mon Jan 6 17:48:30 2014
@@ -0,0 +1,194 @@
+/* ==========================================
+ * JGraphT : a free Java graph-theory library
+ * ==========================================
+ *
+ * Project Info: http://jgrapht.sourceforge.net/
+ * Project Creator: Barak Naveh (http://sourceforge.net/users/barak_naveh)
+ *
+ * (C) Copyright 2003-2008, by Barak Naveh and Contributors.
+ *
+ * This library is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ * License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library; if not, write to the Free Software Foundation,
+ * Inc.,
+ * 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
+ */
+/* ----------------------
+ * JGraphAdapterDemo.java
+ * ----------------------
+ * (C) Copyright 2003-2008, by Barak Naveh and Contributors.
+ *
+ * Original Author: Barak Naveh
+ * Contributor(s): -
+ *
+ * $Id: JGraphAdapterDemo.java 725 2010-11-26 01:24:28Z perfecthash $
+ *
+ * Changes
+ * -------
+ * 03-Aug-2003 : Initial revision (BN);
+ * 07-Nov-2003 : Adaptation to JGraph 3.0 (BN);
+ *
+ */
+package opennlp.tools.parse_thicket.parse_thicket2graph;
+
+import java.awt.*;
+import java.awt.geom.*;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+
+import javax.swing.*;
+
+
+import org.jgraph.*;
+import org.jgraph.graph.*;
+
+import org.jgrapht.*;
+import org.jgrapht.ext.*;
+import org.jgrapht.graph.*;
+
+
+import org.jgrapht.graph.DefaultEdge;
+
+public class ParseTreeVisualizer
+extends JApplet
+{
+ //~ Static fields/initializers ---------------------------------------------
+
+ private static final long serialVersionUID = 3256346823498765434L;
+ // background used by adjustDisplaySettings when no "bgcolor" parameter is available
+ private static final Color DEFAULT_BG_COLOR = Color.decode("#FAFBFF");
+ // preferred size of the graph component; importGraph also resizes the applet to it
+ private static final Dimension DEFAULT_SIZE = new Dimension(1200, 800);
+
+ //~ Instance fields --------------------------------------------------------
+
+ // adapter between the JGraphT graph and its JGraph model; assigned in
+ // importGraph and used by positionVertexAt
+ private JGraphModelAdapter<String, DefaultEdge> jgAdapter;
+
+ /**
+  * Opens a window showing the given graph. A fresh visualizer instance is
+  * created for the window; closing the window exits the JVM.
+  *
+  * @param g graph to display
+  */
+ public void showGraph(Graph g){
+   ParseTreeVisualizer viewer = new ParseTreeVisualizer();
+   viewer.importGraph(g);
+
+   JFrame window = new JFrame();
+   window.getContentPane().add(viewer);
+   window.setTitle("Showing parse thicket");
+   window.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
+   window.pack();
+   window.setVisible(true);
+ }
+
+ /**
+  * Creates the JGraph visualization of the given graph and lays its vertices
+  * out. Vertices may be of any type: their text form (toString) is used to
+  * find the level, which is the number following '#' in the label. Every
+  * vertex is placed 20 units further down and to the right than the previous
+  * one, plus 50 units to the right for each vertex already seen on its level.
+  *
+  * @param g graph to display
+  */
+ @SuppressWarnings({ "unchecked", "rawtypes" })
+ private void importGraph(Graph g) {
+   // create a visualization using JGraph, via an adapter
+   jgAdapter = new JGraphModelAdapter<String, DefaultEdge>(g);
+
+   JGraph jgraph = new JGraph(jgAdapter);
+
+   adjustDisplaySettings(jgraph);
+   getContentPane().add(jgraph);
+   resize(DEFAULT_SIZE);
+
+   int count = 0;
+   Map<Integer, Integer> level_count = new HashMap<Integer, Integer>();
+
+   // iterate over the vertices as plain objects: casting them to String threw
+   // ClassCastException for graphs whose vertices are not strings
+   for (Object vertex : g.vertexSet()) {
+     String vertexStr = String.valueOf(vertex);
+     int key = 0;
+     try {
+       if (vertexStr.indexOf('#') > -1)
+         key = Integer.parseInt(vertexStr.split("#")[1]);
+     } catch (Exception e) {
+       // label without a parsable level: reported, then treated as level 0
+       e.printStackTrace();
+     }
+
+     int howManyAlready = 0;
+     if (key > 0) {
+       Integer seen = level_count.get(key);
+       howManyAlready = (seen == null) ? 0 : seen;
+       level_count.put(key, howManyAlready + 1);
+     }
+     positionVertexAt(vertex, count + howManyAlready * 50, count);
+     count += 20;
+   }
+ }
+
+ /**
+  * Alternative starting point which runs this applet as a stand-alone
+  * application: the applet is initialised and shown inside a frame.
+  *
+  * @param args ignored.
+  */
+ public static void main(String [] args)
+ {
+   ParseTreeVisualizer viewer = new ParseTreeVisualizer();
+   viewer.init();
+
+   JFrame window = new JFrame();
+   window.getContentPane().add(viewer);
+   window.setTitle("JGraphT Adapter to JGraph Demo");
+   window.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
+   window.pack();
+   window.setVisible(true);
+ }
+
+
+
+ /**
+  * Sets the preferred size and the background colour of the graph component.
+  * The colour comes from the applet parameter "bgcolor" when it is available
+  * and well formed, otherwise the default background colour is used.
+  *
+  * @param jg graph component to configure
+  */
+ private void adjustDisplaySettings(JGraph jg)
+ {
+   jg.setPreferredSize(DEFAULT_SIZE);
+
+   Color c = DEFAULT_BG_COLOR;
+   String colorStr = null;
+
+   try {
+     colorStr = getParameter("bgcolor");
+   } catch (Exception ignored) {
+     // deliberately ignored: reading the parameter is best effort (this
+     // component is also shown inside a plain JFrame); keep the default
+   }
+
+   if (colorStr != null) {
+     try {
+       c = Color.decode(colorStr);
+     } catch (NumberFormatException ignored) {
+       // malformed colour value: keep the default instead of failing
+     }
+   }
+
+   jg.setBackground(c);
+ }
+
+ /**
+  * Moves the cell of the given vertex to (x, y), keeping its current width
+  * and height.
+  *
+  * @param vertex vertex whose cell is moved
+  * @param x new horizontal position
+  * @param y new vertical position
+  */
+ @SuppressWarnings("unchecked") // FIXME hb 28-nov-05: See FIXME below
+ private void positionVertexAt(Object vertex, int x, int y)
+ {
+   DefaultGraphCell vertexCell = jgAdapter.getVertexCell(vertex);
+   AttributeMap attributes = vertexCell.getAttributes();
+   Rectangle2D current = GraphConstants.getBounds(attributes);
+
+   // same size, new origin
+   GraphConstants.setBounds(
+       attributes,
+       new Rectangle2D.Double(x, y, current.getWidth(), current.getHeight()));
+
+   // TODO: Clean up generics once JGraph goes generic
+   AttributeMap changes = new AttributeMap();
+   changes.put(vertexCell, attributes);
+   jgAdapter.edit(changes, null, null, null);
+ }
+
+}
+
+// End JGraphAdapterDemo.java
Added: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/pattern_structure/PhraseConcept.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/pattern_structure/PhraseConcept.java?rev=1555944&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/pattern_structure/PhraseConcept.java (added)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/pattern_structure/PhraseConcept.java Mon Jan 6 17:48:30 2014
@@ -0,0 +1,45 @@
+package opennlp.tools.parse_thicket.pattern_structure;
+
+import java.util.*;
+import java.io.*;
+
+import opennlp.tools.textsimilarity.ParseTreeChunk;
+
+/**
+ * A concept holding a position, an intent (groups of parse tree chunks) and
+ * the positions of its parent concepts. The fields are package-visible and
+ * are read directly by other classes of this package.
+ */
+public class PhraseConcept {
+  int position;
+  //Set<Integer> intent;
+  List<List<ParseTreeChunk>> intent;
+  Set<Integer> parents;
+
+  /** Creates a concept at position -1 with an empty intent and no parents. */
+  public PhraseConcept() {
+    position = -1;
+    intent = new ArrayList<List<ParseTreeChunk>>();
+    parents = new HashSet<Integer>();
+  }
+
+  /** Sets the position of this concept. */
+  public void setPosition( int newPosition ){
+    position = newPosition;
+  }
+
+  /**
+   * Replaces the content of the intent with the content of the given list.
+   *
+   * @param newIntent new intent; copied, the list itself is not retained
+   */
+  public void setIntent( List<List<ParseTreeChunk>> newIntent ){
+    // copy before clearing: when newIntent is this concept's own list,
+    // clearing first would wipe the data that is about to be added
+    List<List<ParseTreeChunk>> copy = new ArrayList<List<ParseTreeChunk>>(newIntent);
+    intent.clear();
+    intent.addAll(copy);
+  }
+
+  /**
+   * Replaces the content of the parent set with the content of the given set.
+   *
+   * @param newParents new parents; copied, the set itself is not retained
+   */
+  public void setParents( Set<Integer> newParents ){
+    // copy before clearing, for the same reason as in setIntent
+    Set<Integer> copy = new HashSet<Integer>(newParents);
+    parents.clear();
+    parents.addAll(copy);
+  }
+
+  /** Prints position, intent and parents to standard output. */
+  public void printConcept() {
+    System.out.println("Concept position:" + position);
+    System.out.println("Concept intent:" + intent);
+    System.out.println("Concept parents:" + parents);
+  }
+
+  /** Small manual check: prints a fresh concept before and after repositioning. */
+  public static void main(String []args) {
+    PhraseConcept c = new PhraseConcept();
+    c.printConcept();
+    c.setPosition(10);
+    c.printConcept();
+    //List<List<ParseTreeChunk>> test = new List<List<ParseTreeChunk>>();
+    //c.setIntent(test);
+    c.printConcept();
+
+  }
+}
\ No newline at end of file
Added: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/pattern_structure/PhrasePatternStructure.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/pattern_structure/PhrasePatternStructure.java?rev=1555944&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/pattern_structure/PhrasePatternStructure.java (added)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/pattern_structure/PhrasePatternStructure.java Mon Jan 6 17:48:30 2014
@@ -0,0 +1,166 @@
+package opennlp.tools.parse_thicket.pattern_structure;
+
+import java.util.*;
+import java.io.*;
+
+import opennlp.tools.parse_thicket.ParseCorefsBuilder;
+import opennlp.tools.parse_thicket.ParseThicket;
+import opennlp.tools.parse_thicket.ParseTreeNode;
+import opennlp.tools.parse_thicket.matching.PT2ThicketPhraseBuilder;
+import opennlp.tools.textsimilarity.ParseTreeChunk;
+import opennlp.tools.textsimilarity.ParseTreeMatcherDeterministic;
+
+
+/**
+ * Lattice of {@link PhraseConcept}s whose intents are grouped phrase chunks.
+ * Concepts are stored in {@code conceptList} and refer to each other by list position.
+ */
+public class PhrasePatternStructure {
+  int objectCount;
+  int attributeCount;
+  /** All concepts of the lattice; a concept's position is its index in this list. */
+  ArrayList<PhraseConcept> conceptList;
+  /** Matcher used to compute the common part of two intents. */
+  ParseTreeMatcherDeterministic md;
+
+  /**
+   * Creates a lattice that contains only the bottom concept, stored at position 0
+   * with an empty intent.
+   *
+   * @param objectCounts value reported as max_object_index by {@link #printLatticeStats()}
+   * @param attributeCounts value reported as max_attribute_index by {@link #printLatticeStats()}
+   */
+  public PhrasePatternStructure(int objectCounts, int attributeCounts) {
+    objectCount = objectCounts;
+    attributeCount = attributeCounts;
+    conceptList = new ArrayList<PhraseConcept>();
+    PhraseConcept bottom = new PhraseConcept();
+    md = new ParseTreeMatcherDeterministic();
+    bottom.setPosition(0);
+    conceptList.add(bottom);
+  }
+
+  /**
+   * Starting at the concept {@code Generator}, repeatedly moves to a parent whose
+   * intent contains every group of {@code intent}, until no parent qualifies.
+   *
+   * @return the position of the concept at which the climb stopped
+   */
+  public int GetMaximalConcept(List<List<ParseTreeChunk>> intent, int Generator) {
+    boolean parentIsMaximal = true;
+    while (parentIsMaximal) {
+      parentIsMaximal = false;
+      for (int parent : conceptList.get(Generator).parents) {
+        if (conceptList.get(parent).intent.containsAll(intent)) {
+          Generator = parent;
+          parentIsMaximal = true;
+          break;
+        }
+      }
+    }
+    return Generator;
+  }
+
+  /**
+   * Inserts a concept with the given intent into the lattice, recursively inserting
+   * the intersections with the parents of the generator where needed.
+   *
+   * @param intent intent of the concept to add
+   * @param generator position of the concept from which the search starts
+   * @return position of the concept carrying {@code intent}: an existing one when an
+   *         equal intent is already present at the maximal concept, otherwise the new one
+   */
+  public int AddIntent(List<List<ParseTreeChunk>> intent, int generator) {
+    System.out.println("debug");
+    System.out.println("called for " + intent);
+    generator = GetMaximalConcept(intent, generator);
+    if (conceptList.get(generator).intent.equals(intent)) {
+      System.out.println("at generator:" + conceptList.get(generator).intent);
+      System.out.println("to add:" + intent);
+
+      System.out.println("already generated");
+      return generator;
+    }
+    Set<Integer> generatorParents = conceptList.get(generator).parents;
+    Set<Integer> newParents = new HashSet<Integer>();
+    for (int candidate : generatorParents) {
+      if (!intent.containsAll(conceptList.get(candidate).intent)) {
+        // The candidate is not covered by the new intent: continue with the concept
+        // built from what the two intents have in common.
+        List<List<ParseTreeChunk>> intersection = md
+            .matchTwoSentencesGroupedChunksDeterministic(intent, conceptList.get(candidate).intent);
+        System.out.println("recursive call (inclusion)");
+        candidate = AddIntent(intersection, candidate);
+      }
+      boolean addParents = true;
+      System.out.println("now iterating over parents");
+      // Explicit iterator so entries can be removed while scanning.
+      Iterator<Integer> iterator = newParents.iterator();
+      while (iterator.hasNext()) {
+        Integer parent = iterator.next();
+        if (conceptList.get(parent).intent.containsAll(conceptList.get(candidate).intent)) {
+          addParents = false;
+          break;
+        } else {
+          if (conceptList.get(candidate).intent.containsAll(conceptList.get(parent).intent)) {
+            iterator.remove();
+          }
+        }
+      }
+      if (addParents) {
+        newParents.add(candidate);
+      }
+    }
+    System.out.println("size of lattice: " + conceptList.size());
+    PhraseConcept newConcept = new PhraseConcept();
+    newConcept.setIntent(intent);
+    newConcept.setPosition(conceptList.size());
+    conceptList.add(newConcept);
+    conceptList.get(generator).parents.add(newConcept.position);
+    // The new concept is placed between the generator and the selected parents.
+    for (int newParent : newParents) {
+      if (conceptList.get(generator).parents.contains(newParent)) {
+        conceptList.get(generator).parents.remove(newParent);
+      }
+      conceptList.get(newConcept.position).parents.add(newParent);
+    }
+    return newConcept.position;
+  }
+
+  /** Prints the stored counts and the current number of concepts. */
+  public void printLatticeStats() {
+    System.out.println("Lattice stats");
+    System.out.println("max_object_index = " + objectCount);
+    System.out.println("max_attribute_index = " + attributeCount);
+    System.out.println("Current concept count = " + conceptList.size());
+  }
+
+  /** Prints every concept of the lattice in list order. */
+  public void printLattice() {
+    for (int i = 0; i < conceptList.size(); ++i) {
+      printConceptByPosition(i);
+    }
+  }
+
+  /** Prints the concept stored at {@code index}. */
+  public void printConceptByPosition(int index) {
+    System.out.println("Concept at position " + index);
+    conceptList.get(index).printConcept();
+  }
+
+  /**
+   * Converts phrases (lists of parse tree nodes) into chunks grouped by phrase type.
+   * The type of a phrase is the phrase type of its first node. Phrases that are null,
+   * empty, or whose type is not NP, VP or PP are ignored.
+   *
+   * @param phrs phrases to convert
+   * @return three lists, in this order: NP chunks, VP chunks, PP chunks
+   */
+  public List<List<ParseTreeChunk>> formGroupedPhrasesFromChunksForPara(
+      List<List<ParseTreeNode>> phrs) {
+    List<ParseTreeChunk> nps = new ArrayList<ParseTreeChunk>();
+    List<ParseTreeChunk> vps = new ArrayList<ParseTreeChunk>();
+    List<ParseTreeChunk> pps = new ArrayList<ParseTreeChunk>();
+    for (List<ParseTreeNode> ps : phrs) {
+      if (ps == null || ps.isEmpty()) {
+        continue; // no first node, hence no phrase type to classify by
+      }
+      // Constant-first comparison keeps a missing phrase type from throwing.
+      String ptype = ps.get(0).getPhraseType();
+      List<ParseTreeChunk> bucket;
+      if ("NP".equals(ptype)) {
+        bucket = nps;
+      } else if ("VP".equals(ptype)) {
+        bucket = vps;
+      } else if ("PP".equals(ptype)) {
+        bucket = pps;
+      } else {
+        continue;
+      }
+      bucket.add(convertNodeListIntoChunk(ps));
+    }
+    List<List<ParseTreeChunk>> results = new ArrayList<List<ParseTreeChunk>>();
+    results.add(nps);
+    results.add(vps);
+    results.add(pps);
+    return results;
+  }
+
+  /**
+   * Builds a chunk from the words and POS tags of the nodes; the main POS of the
+   * chunk is the phrase type of the first node. Expects a non-empty list.
+   */
+  private ParseTreeChunk convertNodeListIntoChunk(List<ParseTreeNode> ps) {
+    List<String> lemmas = new ArrayList<String>();
+    List<String> poss = new ArrayList<String>();
+    for (ParseTreeNode n : ps) {
+      lemmas.add(n.getWord());
+      poss.add(n.getPos());
+    }
+    ParseTreeChunk ch = new ParseTreeChunk(lemmas, poss, 0, 0);
+    ch.setMainPOS(ps.get(0).getPhraseType());
+    return ch;
+  }
+
+}
\ No newline at end of file
Added: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/rhetoric_structure/RhetoricStructureArcsBuilder.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/rhetoric_structure/RhetoricStructureArcsBuilder.java?rev=1555944&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/rhetoric_structure/RhetoricStructureArcsBuilder.java (added)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/rhetoric_structure/RhetoricStructureArcsBuilder.java Mon Jan 6 17:48:30 2014
@@ -0,0 +1,162 @@
+package opennlp.tools.parse_thicket.rhetoric_structure;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import opennlp.tools.parse_thicket.ArcType;
+import opennlp.tools.parse_thicket.Pair;
+import opennlp.tools.parse_thicket.ParseThicket;
+import opennlp.tools.parse_thicket.ParseTreeNode;
+import opennlp.tools.parse_thicket.WordWordInterSentenceRelationArc;
+
+import org.jgrapht.Graph;
+import org.jgrapht.graph.DefaultEdge;
+import org.jgrapht.graph.SimpleGraph;
+
+
+import edu.stanford.nlp.trees.Tree;
+
+/**
+ * Derives rhetoric-structure (RST) arcs between sentences of a parse thicket from
+ * the RST markers found in each sentence and the existing coreference arcs.
+ */
+public class RhetoricStructureArcsBuilder {
+  private RhetoricStructureMarker markerBuilderForSentence = new RhetoricStructureMarker();
+
+  /**
+   * Extracts the RST markers of every sentence of the thicket.
+   *
+   * @return map from sentence number (0-based, in the order of
+   *         {@code pt.getNodesThicket()}) to the markers found in that sentence
+   */
+  private Map<Integer, List<Pair<String, Integer[]>>> buildMarkers(ParseThicket pt) {
+    Map<Integer, List<Pair<String, Integer[]>>> sentNumMarkers =
+        new HashMap<Integer, List<Pair<String, Integer[]>>>();
+    int count = 0;
+    for (List<ParseTreeNode> sent : pt.getNodesThicket()) {
+      sentNumMarkers.put(count,
+          markerBuilderForSentence.extractRSTrelationInSentenceGetBoundarySpan(sent));
+      count++;
+    }
+    return sentNumMarkers;
+  }
+
+  /**
+   * Induced RST algorithm.
+   *
+   * Input: obtained RST markers (numbers of words which split a sentence into
+   * potential RST relation arguments) plus the coreference arcs of the current
+   * parse thicket. For every coreference arc linking two sentences we look for
+   * phrases positioned consistently with an RST marker in each sentence:
+   *
+   *   Mark yelled at his dog, but it disobeyed
+   *    |                       \
+   *   coref                     RST arc for CONTRAST being formed
+   *    |                         \
+   *   He was upset, however he did not show it
+   *
+   * At most one RST arc is produced per coreference arc: the first combination of
+   * phrases and markers that satisfies the ordering test.
+   *
+   * @param arcs existing arcs; only those whose type starts with "coref" are used.
+   *        The list itself is not modified.
+   * @param sentNumPhrasesMap phrases of each sentence, keyed by sentence number
+   * @param pt the parse thicket the arcs belong to
+   * @return the newly formed RST arcs (possibly empty, never null)
+   */
+  public List<WordWordInterSentenceRelationArc> buildRSTArcsFromMarkersAndCorefs(
+      List<WordWordInterSentenceRelationArc> arcs,
+      Map<Integer, List<List<ParseTreeNode>>> sentNumPhrasesMap,
+      ParseThicket pt) {
+    List<WordWordInterSentenceRelationArc> arcsRST =
+        new ArrayList<WordWordInterSentenceRelationArc>();
+
+    Map<Integer, List<Pair<String, Integer[]>>> rstMarkersMap = buildMarkers(pt);
+    int sentenceCount = pt.getSentences().size();
+
+    for (int nSentFrom = 0; nSentFrom < sentenceCount; nSentFrom++) {
+      for (int nSentTo = nSentFrom + 1; nSentTo < sentenceCount; nSentTo++) {
+        List<List<ParseTreeNode>> phrasesFrom = sentNumPhrasesMap.get(nSentFrom);
+        List<List<ParseTreeNode>> phrasesTo = sentNumPhrasesMap.get(nSentTo);
+        List<Pair<String, Integer[]>> markersFrom = rstMarkersMap.get(nSentFrom);
+        List<Pair<String, Integer[]>> markersTo = rstMarkersMap.get(nSentTo);
+        if (phrasesFrom == null || phrasesTo == null
+            || markersFrom == null || markersTo == null) {
+          continue; // nothing known about one of the two sentences
+        }
+        for (WordWordInterSentenceRelationArc arc : arcs) {
+          // arc should be coref and link these sentences
+          if (nSentFrom != arc.getCodeFrom().getFirst()
+              || nSentTo != arc.getCodeTo().getFirst()
+              || !arc.getArcType().getType().startsWith("coref")) {
+            continue;
+          }
+          int sentPosFrom = arc.getCodeFrom().getSecond();
+          int sentPosTo = arc.getCodeTo().getSecond();
+          WordWordInterSentenceRelationArc arcRST = findRstArc(nSentFrom, nSentTo,
+              sentPosFrom, sentPosTo, phrasesFrom, phrasesTo, markersFrom, markersTo);
+          if (arcRST != null) {
+            arcsRST.add(arcRST);
+          }
+        }
+      }
+    }
+
+    // The built arcs are the result; returning the input list would discard them.
+    return arcsRST;
+  }
+
+  /**
+   * Returns the RST arc for the first (phrase, phrase, marker, marker) combination in
+   * which, for both sentences, the coreference position, the id of the first node of
+   * the phrase and the start of the marker pass {@link #isSequence(Integer[])};
+   * returns null when there is no such combination.
+   */
+  private static WordWordInterSentenceRelationArc findRstArc(
+      int nSentFrom, int nSentTo, int sentPosFrom, int sentPosTo,
+      List<List<ParseTreeNode>> phrasesFrom, List<List<ParseTreeNode>> phrasesTo,
+      List<Pair<String, Integer[]>> markersFrom, List<Pair<String, Integer[]>> markersTo) {
+    for (List<ParseTreeNode> vpFrom : phrasesFrom) {
+      if (vpFrom.isEmpty()) {
+        continue;
+      }
+      for (List<ParseTreeNode> vpTo : phrasesTo) {
+        if (vpTo.isEmpty()) {
+          continue;
+        }
+        for (Pair<String, Integer[]> mFrom : markersFrom) {
+          for (Pair<String, Integer[]> mTo : markersTo) {
+            boolean fromOrdered = isSequence(new Integer[] {
+                sentPosFrom, vpFrom.get(0).getId(), mFrom.getSecond()[0]});
+            boolean toOrdered = fromOrdered && isSequence(new Integer[] {
+                sentPosTo, vpTo.get(0).getId(), mTo.getSecond()[0]});
+            if (toOrdered) {
+              // The relation name comes from the marker of the first sentence;
+              // the arc connects the end positions of the two markers.
+              ArcType arcType = new ArcType("rst", mFrom.getFirst(), 0, 0);
+              return new WordWordInterSentenceRelationArc(
+                  new Pair<Integer, Integer>(nSentFrom, mFrom.getSecond()[1]),
+                  new Pair<Integer, Integer>(nSentTo, mTo.getSecond()[1]), "", "", arcType);
+            }
+          }
+        }
+      }
+    }
+    return null;
+  }
+
+  /**
+   * Checks whether the given positions are monotone, i.e. entirely non-decreasing or
+   * entirely non-increasing.
+   *
+   * @return false for a null array, fewer than three entries, or any entry that is
+   *         null or 0; otherwise whether the entries are monotone
+   */
+  // TODO make more sensitive algo
+  private static boolean isSequence(Integer[] integers) {
+    if (integers == null || integers.length < 3) {
+      return false;
+    }
+    for (Integer value : integers) {
+      if (value == null || value == 0) {
+        return false;
+      }
+    }
+
+    boolean nonDecreasing = true;
+    boolean nonIncreasing = true;
+    for (int i = 1; i < integers.length; i++) {
+      if (integers[i - 1] > integers[i]) {
+        nonDecreasing = false;
+      }
+      if (integers[i - 1] < integers[i]) {
+        nonIncreasing = false;
+      }
+    }
+    return nonDecreasing || nonIncreasing;
+  }
+
+  public static void main(String[] args) {
+  }
+}
Added: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/rhetoric_structure/RhetoricStructureMarker.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/rhetoric_structure/RhetoricStructureMarker.java?rev=1555944&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/rhetoric_structure/RhetoricStructureMarker.java (added)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/rhetoric_structure/RhetoricStructureMarker.java Mon Jan 6 17:48:30 2014
@@ -0,0 +1,129 @@
+package opennlp.tools.parse_thicket.rhetoric_structure;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import opennlp.tools.parse_thicket.IGeneralizer;
+import opennlp.tools.parse_thicket.Pair;
+import opennlp.tools.parse_thicket.ParseTreeNode;
+
+
+public class RhetoricStructureMarker implements IGeneralizer<Integer[]> {
+ //private static String rstRelations[] = {"antithesis", "concession", "contrast", "elaboration"};
+ List<Pair<String, ParseTreeNode[]>> rstMarkers = new ArrayList<Pair<String, ParseTreeNode[]>>();
+
+ public RhetoricStructureMarker(){
+
+ rstMarkers.add(new Pair<String, ParseTreeNode[]>("contrast", new ParseTreeNode[]{new ParseTreeNode(",",","), new ParseTreeNode("than",",") }));
+ rstMarkers.add(new Pair<String, ParseTreeNode[]>( "antithesis", new ParseTreeNode[]{new ParseTreeNode("although",","), new ParseTreeNode("*","*") }));
+ rstMarkers.add(new Pair<String, ParseTreeNode[]>( "contrast", new ParseTreeNode[]{new ParseTreeNode(",",","), new ParseTreeNode("however","*") }));
+ rstMarkers.add(new Pair<String, ParseTreeNode[]>( "contrast", new ParseTreeNode[]{new ParseTreeNode("however","*"), new ParseTreeNode(",",","),
+ new ParseTreeNode("*","prp"), }));
+ rstMarkers.add(new Pair<String, ParseTreeNode[]>( "elaboration", new ParseTreeNode[]{new ParseTreeNode(",",","), new ParseTreeNode("*","NN") }));
+ rstMarkers.add(new Pair<String, ParseTreeNode[]>( "elaboration", new ParseTreeNode[]{new ParseTreeNode("as","*"), new ParseTreeNode("a","*") }));
+
+ rstMarkers.add(new Pair<String, ParseTreeNode[]>("explanation", new ParseTreeNode[]{new ParseTreeNode(",",","), new ParseTreeNode("because",",") }));
+ rstMarkers.add(new Pair<String, ParseTreeNode[]>( "example", new ParseTreeNode[]{new ParseTreeNode("for","IN"), new ParseTreeNode("example","NN") }));
+ rstMarkers.add(new Pair<String, ParseTreeNode[]>( "contrast", new ParseTreeNode[]{new ParseTreeNode(",",","), new ParseTreeNode("ye","*") }));
+ rstMarkers.add(new Pair<String, ParseTreeNode[]>( "contrast", new ParseTreeNode[]{new ParseTreeNode("yet","*"), new ParseTreeNode(",",","),
+ new ParseTreeNode("*","prp"), }));
+
+ rstMarkers.add(new Pair<String, ParseTreeNode[]>( "contrast", new ParseTreeNode[]{new ParseTreeNode("yet","*"), new ParseTreeNode("i","*"),
+ }));
+
+ rstMarkers.add(new Pair<String, ParseTreeNode[]>( "explanation", new ParseTreeNode[]{new ParseTreeNode(",",","), new ParseTreeNode("where","*") }));
+ //as long as
+ rstMarkers.add(new Pair<String, ParseTreeNode[]>( "temp_sequence", new ParseTreeNode[]{/*new ParseTreeNode("as","*"),*/ new ParseTreeNode("*","RB"),
+ new ParseTreeNode("as","IN"),}));
+ rstMarkers.add(new Pair<String, ParseTreeNode[]>( "temp_sequence", new ParseTreeNode[]{/*new ParseTreeNode("as","*"),*/ new ParseTreeNode("*","VB*"),
+ new ParseTreeNode("until","IN"),}));
+
+ }
+
+ /* For a sentence, we obtain a list of markers with the CA word and position in the sentence
+ * Output span is an integer array with start/end occurrence of an RST marker in a sentence
+ * */
+ public List<Pair<String, Integer[]>> extractRSTrelationInSentenceGetBoundarySpan(List<ParseTreeNode> sentence){
+ List<Pair<String, Integer[]>> results = new ArrayList<Pair<String, Integer[]>> ();
+
+ for(Pair<String, ParseTreeNode[]> template: rstMarkers){
+ List<Integer[]> spanList = generalize(sentence,template.getSecond() );
+ if (!spanList.isEmpty())
+ results.add(new Pair<String, Integer[]>(template.getFirst(), spanList.get(0)));
+ }
+ return results;
+ }
+
+ /* Rule application in the form of generalization
+ * Generalizing a sentence with a rule (a template), we obtain the occurrence of rhetoric marker
+ *
+ * o1 - sentence
+ * o2 - rule/template, specifying lemmas and/or POS, including punctuation
+ * @see opennlp.tools.parse_thicket.IGeneralizer#generalize(java.lang.Object, java.lang.Object)
+ * returns the span Integer[]
+ */
+ @Override
+ public List<Integer[]> generalize(Object o1, Object o2) {
+ List<Integer[]> result = new ArrayList<Integer[]>();
+
+ List<ParseTreeNode> sentence = (List<ParseTreeNode> )o1;
+ ParseTreeNode[] template = (ParseTreeNode[]) o2;
+
+ boolean bBeingMatched = false;
+ for(int wordIndexInSentence=0; wordIndexInSentence<sentence.size(); wordIndexInSentence++){
+ ParseTreeNode word = sentence.get(wordIndexInSentence);
+ int wordIndexInSentenceEnd = wordIndexInSentence; //init iterators for internal loop
+ int templateIterator=0;
+ while (wordIndexInSentenceEnd<sentence.size() && templateIterator< template.length){
+ ParseTreeNode tword = template[templateIterator];
+ ParseTreeNode currWord=sentence.get(wordIndexInSentenceEnd);
+ List<ParseTreeNode> gRes = tword.generalize(tword, currWord);
+ if (gRes.isEmpty()|| gRes.get(0)==null || ( gRes.get(0).getWord().equals("*")
+ && gRes.get(0).getPos().equals("*") )){
+ bBeingMatched = false;
+ break;
+ } else {
+ bBeingMatched = true;
+ }
+ wordIndexInSentenceEnd++;
+ templateIterator++;
+ }
+ // template iteration is done
+ // the only condition for successful match is IF we are at the end of template
+ if (templateIterator == template.length){
+ result.add(new Integer[]{wordIndexInSentence, wordIndexInSentenceEnd-1});
+ return result;
+ }
+
+ // no match for current sentence word: proceed to the next
+ }
+ return result;
+ }
+
+ public String markerToString(List<Pair<String, Integer[]>> res){
+ StringBuffer buf = new StringBuffer();
+ buf.append("[");
+ for(Pair<String, Integer[]> marker: res){
+ buf.append(marker.getFirst()+":");
+ for(int a: marker.getSecond()){
+ buf.append(a+" ");
+ }
+ buf.append (" | ");
+ }
+ buf.append("]");
+ return buf.toString();
+ }
+
+ public static void main(String[] args){
+ ParseTreeNode[] sent =
+ new ParseTreeNode[]{new ParseTreeNode("he","prn"), new ParseTreeNode("was","vbz"), new ParseTreeNode("more","jj"),
+ new ParseTreeNode(",",","), new ParseTreeNode("than",","), new ParseTreeNode("little","jj"), new ParseTreeNode("boy","nn"),
+ new ParseTreeNode(",",","), new ParseTreeNode("however","*"), new ParseTreeNode(",",","),
+ new ParseTreeNode("he","prp"), new ParseTreeNode("was","vbz"), new ParseTreeNode("adult","jj")
+ };
+
+ List<Pair<String, Integer[]>> res = new RhetoricStructureMarker().extractRSTrelationInSentenceGetBoundarySpan(Arrays.asList(sent));
+ System.out.println( new RhetoricStructureMarker().markerToString(res));
+ }
+}
Modified: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingQueryRunner.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingQueryRunner.java?rev=1555944&r1=1555943&r2=1555944&view=diff
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingQueryRunner.java (original)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingQueryRunner.java Mon Jan 6 17:48:30 2014
@@ -17,28 +17,90 @@
package opennlp.tools.similarity.apps;
-import java.io.BufferedReader;
-import java.io.InputStreamReader;
-import java.net.URL;
-import java.net.URLConnection;
-import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Logger;
-import org.apache.commons.lang.StringUtils;
-import org.json.JSONArray;
-import org.json.JSONObject;
+import net.billylieurance.azuresearch.AzureSearchImageQuery;
+import net.billylieurance.azuresearch.AzureSearchImageResult;
+import net.billylieurance.azuresearch.AzureSearchResultSet;
+import net.billylieurance.azuresearch.AzureSearchWebQuery;
+import net.billylieurance.azuresearch.AzureSearchWebResult;
public class BingQueryRunner {
- protected static final String APP_ID = "e8ADxIjn9YyHx36EihdjH/tMqJJItUrrbPTUpKahiU0=";
- //"DD4E2A5DF8B7E5801ED443E47DC600D5F3E62713";
- // TODO user needs to have own APP_ID from Bing API
+
+ protected static String BING_KEY = "e8ADxIjn9YyHx36EihdjH/tMqJJItUrrbPTUpKahiU0=";
+ private static final Logger LOG = Logger
+ .getLogger("opennlp.tools.similarity.apps.BingQueryRunner");
+ protected AzureSearchWebQuery aq = new AzureSearchWebQuery();
+ private AzureSearchImageQuery iq = new AzureSearchImageQuery();
+
+ /**
+  * Replaces the key used by subsequent searches. The key is held in a static field,
+  * so the change is visible to every instance of this class.
+  */
+ public void setKey(String key) {
+   BingQueryRunner.BING_KEY = key;
+ }
+
+ /** Passes {@code language} to the web query as its market (for example "es-MX"). */
+ public void setLang(String language) {
+   this.aq.setMarket(language);
+ }
+
+ /**
+  * Runs a web search and converts the results into {@link HitBase} objects carrying
+  * abstract text, title and URL.
+  *
+  * If the query fails with the configured key it is retried once with a fallback
+  * key. Failures are logged, not rethrown.
+  *
+  * @param query the search expression
+  * @param nRes number of results requested per page
+  * @return the hits found; empty when no result set is available
+  */
+ public List<HitBase> runSearch(String query, int nRes) {
+   aq.setAppid(BING_KEY);
+   aq.setQuery(query);
+   aq.setPerPage(nRes);
+   try {
+     aq.doQuery();
+   } catch (Exception e) { // most likely exception is due to limit on bing key
+     LOG.log(java.util.logging.Level.WARNING,
+         "Search failed with the configured key, retrying with the fallback key", e);
+     aq.setAppid("pjtCgujmf9TtfjCVBdcQ2rBUQwGLmtLtgCG4Ex7kekw");
+     try {
+       aq.doQuery();
+     } catch (Exception e1) {
+       LOG.log(java.util.logging.Level.SEVERE,
+           "Search failed with the fallback key as well", e1);
+     }
+   }
+
+   List<HitBase> results = new ArrayList<HitBase>();
+   AzureSearchResultSet<AzureSearchWebResult> ars = aq.getQueryResult();
+   if (ars == null) {
+     // Both attempts may have failed before any result set was produced.
+     return results;
+   }
+
+   for (AzureSearchWebResult anr : ars) {
+     HitBase h = new HitBase();
+     h.setAbstractText(anr.getDescription());
+     h.setTitle(anr.getTitle());
+     h.setUrl(anr.getUrl());
+     results.add(h);
+   }
+   return results;
+ }
+
+
+ /**
+  * Runs an image search for {@code query} with the current key.
+  *
+  * @return the result set reported by the image query after it has run
+  */
+ public AzureSearchResultSet<AzureSearchImageResult> runImageSearch(String query) {
+   iq.setAppid(BING_KEY);
+   iq.setQuery(query);
+   iq.doQuery();
+   return iq.getQueryResult();
+ }
+ /**
+  * Counts the hits returned by a "site:" search restricted to {@code site},
+  * requesting up to 1000000 results.
+  */
+ public int getTotalPagesAtASite(String site) {
+   List<HitBase> hits = runSearch("site:" + site, 1000000);
+   return hits.size();
+ }
+
+
+ /** Runs a web search for {@code query}, requesting 100 results. */
+ public List<HitBase> runSearch(String query) {
+   final int defaultResultCount = 100;
+   return runSearch(query, defaultResultCount);
+ }
+
+
+
private float snapshotSimilarityThreshold = 0.4f;
- private static final Logger LOG = Logger
- .getLogger("opennlp.tools.similarity.apps.BingQueryRunner");
+
public void setSnapshotSimilarityThreshold(float thr) {
snapshotSimilarityThreshold = thr;
@@ -53,8 +115,7 @@ public class BingQueryRunner {
}
/*
- *
- */
+
private String constructBingUrl(String query, String domainWeb, String lang,
int numbOfHits) throws Exception {
@@ -73,9 +134,8 @@ public class BingQueryRunner {
return yahooRequest;
}
- /*
- *
- */
+
+
public ArrayList<String> search(String query, String domainWeb, String lang,
int numbOfHits) throws Exception {
URL url = new URL(constructBingUrl(query, domainWeb, lang, numbOfHits));
@@ -145,6 +205,7 @@ public class BingQueryRunner {
hits = HitBase.removeDuplicates(hits);
return hits;
}
+ */
// TODO comment back when dependencies resolved (CopyrightViolations)
/*
@@ -185,10 +246,16 @@ public class BingQueryRunner {
public static void main(String[] args) {
BingQueryRunner self = new BingQueryRunner();
+
+ AzureSearchResultSet<AzureSearchImageResult> res = self.runImageSearch("albert einstein");
+ System.out.println(res);
try {
+ self.setLang("es-MX");
+ self.setKey(
+ "e8ADxIjn9YyHx36EihdjH/tMqJJItUrrbPTUpKahiU0=");
List<HitBase> resp = self
- .runSearch("Rates rise at weekly Treasury auction");
- // "British Actress Lynn Redgrave dies at 67");
+ .runSearch(//"art scene");
+ "biomecanica las palancas");
System.out.print(resp.get(0));
} catch (Exception e) {
// TODO Auto-generated catch block
@@ -196,6 +263,12 @@ public class BingQueryRunner {
}
/*
+ *
+ * de-DE
+ * es-MX
+ * es-SP
+ */
+ /*
* String[] submittedNews = new String[]{
* "Asian airports had already increased security following the Christmas Day attack, but South Korea and Pakistan are thinking about additional measures."
* ,
Added: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/ContentGenerator.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/ContentGenerator.java?rev=1555944&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/ContentGenerator.java (added)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/ContentGenerator.java Mon Jan 6 17:48:30 2014
@@ -0,0 +1,467 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.similarity.apps;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.logging.Logger;
+
+import opennlp.tools.parse_thicket.Triple;
+import opennlp.tools.similarity.apps.utils.PageFetcher;
+import opennlp.tools.similarity.apps.utils.StringDistanceMeasurer;
+import opennlp.tools.similarity.apps.utils.Utils;
+import opennlp.tools.textsimilarity.ParseTreeChunk;
+import opennlp.tools.textsimilarity.ParseTreeChunkListScorer;
+import opennlp.tools.textsimilarity.SentencePairMatchResult;
+import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;
+
+/*
+ * This class does content generation by using web mining and syntactic generalization to get sentences from the web, convert and combine
+ * them in the form
+ * expected to be readable by humans and not distinguishable from genuine content by search engines
+ *
+ */
+
+public class ContentGenerator /*extends RelatedSentenceFinder*/ {
+ private static Logger LOG = Logger
+ .getLogger("opennlp.tools.similarity.apps.ContentGenerator");
+ PageFetcher pFetcher = new PageFetcher();
+ ParserChunker2MatcherProcessor sm = ParserChunker2MatcherProcessor
+ .getInstance();
+ protected ParseTreeChunkListScorer parseTreeChunkListScorer = new ParseTreeChunkListScorer();
+ protected ParseTreeChunk parseTreeChunk = new ParseTreeChunk();
+ protected static StringDistanceMeasurer stringDistanceMeasurer = new StringDistanceMeasurer();
+ protected BingQueryRunner yrunner = new BingQueryRunner();
+ protected ContentGeneratorSupport support = new ContentGeneratorSupport();
+ protected int MAX_STEPS = 1;
+ protected int MAX_SEARCH_RESULTS = 1;
+ protected float RELEVANCE_THRESHOLD = 1.1f;
+
+ //private static final int MAX_FRAGMENT_SENTS = 10;
+
+ /**
+  * Creates a generator with explicit limits.
+  *
+  * @param ms value for MAX_STEPS
+  * @param msr value for MAX_SEARCH_RESULTS
+  * @param thresh value for RELEVANCE_THRESHOLD
+  * @param key search key handed to the query runner
+  */
+ public ContentGenerator(int ms, int msr, float thresh, String key) {
+   MAX_STEPS = ms;
+   MAX_SEARCH_RESULTS = msr;
+   RELEVANCE_THRESHOLD = thresh;
+   yrunner.setKey(key);
+ }
+
+ /** Creates a generator that keeps the field defaults declared in this class. */
+ public ContentGenerator() {
+ }
+ /** Forwards {@code lang} to the query runner used for searching. */
+ public void setLang(String lang) {
+   this.yrunner.setLang(lang);
+ }
+
+
+ /**
+  * Main content generation function which takes a seed as a person, rock
+  * group, or other entity name and produces a list of text fragments by web
+  * mining for <br>
+  *
+  * The seed is combined with each of the frequent performing verbs in turn, and a
+  * paragraph is built for every hit that has abstract text and whose URL does not
+  * contain ".pdf" past its first character.
+  *
+  * @param sentence
+  *          entity name
+  * @return List<HitBase> of text fragment structures which contain approved
+  *         (in terms of relevance) mined sentences, as well as original search
+  *         results objects such as doc titles, abstracts, and urls.
+  * @throws Exception
+  *           declared by this method; presumably raised while building a paragraph
+  *           (verify against buildParagraphOfGeneratedText)
+  */
+
+ public List<HitBase> generateContentAbout(String sentence) throws Exception {
+   List<HitBase> opinionSentencesToAdd = new ArrayList<HitBase>();
+   System.out.println(" \n=== Entity to write about = " + sentence);
+
+   int stepCount = 0;
+   for (String verbAddition : StoryDiscourseNavigator.frequentPerformingVerbs) {
+     List<HitBase> searchResult = yrunner.runSearch(sentence + " "
+         + verbAddition, MAX_SEARCH_RESULTS);
+     // Null must be ruled out before the size is consulted.
+     if (searchResult != null) {
+       if (MAX_SEARCH_RESULTS < searchResult.size()) {
+         searchResult = searchResult.subList(0, MAX_SEARCH_RESULTS);
+       }
+       for (HitBase item : searchResult) { // got some text from .html
+         if (item.getAbstractText() != null
+             && !(item.getUrl().indexOf(".pdf") > 0)) { // exclude pdf
+           opinionSentencesToAdd
+               .add(buildParagraphOfGeneratedText(item, sentence, null));
+         }
+       }
+     }
+     // NOTE(review): the test runs after the increment and uses '>', so up to
+     // MAX_STEPS + 1 verbs are processed -- confirm this is intended.
+     stepCount++;
+     if (stepCount > MAX_STEPS) {
+       break;
+     }
+   }
+
+   opinionSentencesToAdd = ContentGeneratorSupport.removeDuplicatesFromResultantHits(opinionSentencesToAdd);
+   return opinionSentencesToAdd;
+ }
+
+ /**
+  * Takes a sentence and extracts noun phrases and entity names to form search
+  * queries for finding relevant sentences on the web, which are then subject
+  * to relevance assessment by Similarity. Search queries should not be too
+  * general (irrelevant search results) or too specific (too few search
+  * results)
+  *
+  * Only the nouns and adjectives of each noun phrase are kept. Phrases of 2 to 5
+  * such words are accepted, and below 4 words none of them may be all lower case.
+  * When nothing passes, every phrase with at least 2 such words is accepted.
+  *
+  * @param sentence
+  *          input sentence to form queries
+  * @return List<String> of search expressions; the sentence itself is always the
+  *         last element
+  */
+ public static List<String> buildSearchEngineQueryFromSentence(String sentence) {
+   ParserChunker2MatcherProcessor pos = ParserChunker2MatcherProcessor
+       .getInstance();
+   // group 0 holds the phrases used for querying
+   List<ParseTreeChunk> nPhrases = pos
+       .formGroupedPhrasesFromChunksForSentence(sentence).get(0);
+
+   List<String> queryArrayStr = new ArrayList<String>();
+   for (ParseTreeChunk ch : nPhrases) {
+     String query = joinNounAndAdjectiveLemmas(ch);
+     int len = query.split(" ").length;
+     if (len < 2 || len > 5) {
+       continue;
+     }
+     // if only a few words then it has to be a person name, title or geo location
+     if (len < 4 && hasAllLowerCaseWord(query)) {
+       continue;
+     }
+     queryArrayStr.add(toMandatoryTermsQuery(query));
+   }
+   if (queryArrayStr.size() < 1) { // release constraints on NP down to 2 keywords
+     for (ParseTreeChunk ch : nPhrases) {
+       String query = joinNounAndAdjectiveLemmas(ch);
+       if (query.split(" ").length < 2) {
+         continue;
+       }
+       queryArrayStr.add(toMandatoryTermsQuery(query));
+     }
+   }
+
+   queryArrayStr = ContentGeneratorSupport.removeDuplicatesFromQueries(queryArrayStr);
+   queryArrayStr.add(sentence);
+
+   return queryArrayStr;
+ }
+
+ /** Joins, space separated, the lemmas of the chunk whose POS starts with "N" or "J". */
+ private static String joinNounAndAdjectiveLemmas(ParseTreeChunk ch) {
+   StringBuilder joined = new StringBuilder();
+   int size = ch.getLemmas().size();
+   for (int i = 0; i < size; i++) {
+     String posTag = ch.getPOSs().get(i);
+     if (posTag.startsWith("N") || posTag.startsWith("J")) {
+       joined.append(ch.getLemmas().get(i)).append(" ");
+     }
+   }
+   return joined.toString().trim();
+ }
+
+ /** Tells whether at least one space-separated word of the query is entirely lower case. */
+ private static boolean hasAllLowerCaseWord(String query) {
+   for (String w : query.split(" ")) {
+     if (w.toLowerCase().equals(w)) {
+       return true;
+     }
+   }
+   return false;
+ }
+
+ /** Prefixes every word of the query with " +". */
+ private static String toMandatoryTermsQuery(String query) {
+   return " +" + query.trim().replace(" ", " +");
+ }
+
+ /**
+  * Prepares the material from which candidate sentences are formed for one search hit.
+  *
+  * The hit's snippet is stripped of bold tags and split into fragments, each "..."
+  * being replaced by a placeholder. When the snippet contained "...", the page
+  * is downloaded, and if it is longer than 100 characters it is stored on the hit
+  * and split into sentences. The original sentence is also recorded on the hit.
+  *
+  * @param item the search hit; its original sentences and possibly its page content are set
+  * @param originalSentence the sentence the hit was searched for
+  * @param sentsAll not used by this method
+  * @return the snippet fragments, the downloaded page (null if not downloaded) and
+  *         the page sentences (null if not available)
+  */
+ private Triple<List<String>, String, String[]> formCandidateFragmentsForPage(HitBase item, String originalSentence, List<String> sentsAll){
+   // put orig sentence in structure
+   List<String> origs = new ArrayList<String>();
+   origs.add(originalSentence);
+   item.setOriginalSentences(origs);
+
+   // form plain text from snippet
+   String snapshot = item.getAbstractText().replace("<b>", " ")
+       .replace("</b>", " ").replace(" ", " ").replace(" ", " ");
+
+   // fix a template expression which can be substituted by original if
+   // relevant
+   String snapshotMarked = snapshot.replace("...",
+       " _should_find_orig_ . _should_find_orig_");
+   List<String> allFragms = new ArrayList<String>(
+       Arrays.asList(sm.splitSentences(snapshotMarked)));
+
+   String[] sents = null;
+   String downloadedPage = null;
+   try {
+     // the lengths differ only when at least one "..." was replaced
+     if (snapshotMarked.length() != snapshot.length()) {
+       downloadedPage = pFetcher.fetchPage(item.getUrl());
+       if (downloadedPage != null && downloadedPage.length() > 100) {
+         item.setPageContent(downloadedPage);
+         String pageContent = Utils.fullStripHTML(item.getPageContent());
+         pageContent = GeneratedSentenceProcessor
+             .normalizeForSentenceSplitting(pageContent);
+         pageContent = ContentGeneratorSupport.cleanSpacesInCleanedHTMLpage(pageContent);
+
+         sents = ContentGeneratorSupport.cleanListOfSents(sm.splitSentences(pageContent));
+       }
+     }
+   } catch (Exception e) {
+     // Best effort: whatever was gathered before the failure is still returned.
+     System.err
+         .println("Problem downloading the page and splitting into sentences");
+   }
+   return new Triple<List<String>, String, String[]>(allFragms, downloadedPage, sents);
+ }
+
+ /**
+  * Produces the candidate sentence (plus any following sentences) for one snippet fragment.
+  * <p>
+  * If the fragment carries the {@code _should_find_orig_} marker and page sentences are
+  * available, the full sentence is looked up on the downloaded page: first among the
+  * sentences returned by {@code support.extractSentencesFromPage}, then among the page
+  * sentences stored in {@code fragmentExtractionResults}. Otherwise the fragment itself,
+  * with the marker removed, is the only candidate.
+  *
+  * @param fragment
+  *          one fragment of the search-result snippet
+  * @param fragmentExtractionResults
+  *          triple whose second element is the downloaded page and whose third element is
+  *          the array of page sentences (either may be {@code null})
+  * @return array whose first element is the candidate sentence, followed by any follow-up
+  *         sentences; {@code null} when the fragment is shorter than 50 characters or the
+  *         page lookup produced nothing
+  */
+ private String[] formCandidateSentences(String fragment, Triple<List<String>, String, String[]> fragmentExtractionResults){
+   final String marker = "_should_find_orig_";
+
+   String downloadedPage = (String)fragmentExtractionResults.getSecond();
+   String[] sents = (String[])fragmentExtractionResults.getThird();
+
+   if (fragment.length() < 50)
+     return null;
+
+   // try to find original sentence from webpage
+   if (fragment.indexOf(marker) > -1 && sents != null && sents.length > 0) {
+     String cleanFragment = fragment.replace(marker, "");
+     String[] mainAndFollowSent = null;
+     try {
+       // first try sorted sentences from page by length approach
+       String[] sentsSortedByLength = support.extractSentencesFromPage(downloadedPage);
+       try {
+         mainAndFollowSent = ContentGeneratorSupport.getFullOriginalSentenceFromWebpageBySnippetFragment(
+             cleanFragment, sentsSortedByLength);
+       } catch (Exception e) {
+         e.printStackTrace();
+       }
+       // if the above gives null then try to match all sentences from snippet fragment
+       if (mainAndFollowSent == null || mainAndFollowSent[0] == null) {
+         mainAndFollowSent = ContentGeneratorSupport.getFullOriginalSentenceFromWebpageBySnippetFragment(
+             cleanFragment, sents);
+       }
+     } catch (Exception e) {
+       e.printStackTrace();
+     }
+     return mainAndFollowSent;
+   }
+
+   // or get original snippet: String.replace returns a new string, so the stripped copy
+   // has to be the value handed back (it was previously computed and thrown away)
+   return new String[] { fragment.replace(marker, "") };
+ }
+
+ /**
+  * Checks one candidate sentence against the seed text and, if it is accepted, wraps it
+  * (together with its follow-up sentences) into a {@link Fragment}.
+  * <p>
+  * A candidate is rejected when it is {@code null}, not longer than 50 characters, at least
+  * four times longer than the snippet fragment, has no verb or an imperative verb according
+  * to {@code sm.assessRelevance}, scores too low, or is refused by
+  * {@code GeneratedSentenceProcessor.acceptableMinedSentence}.
+  *
+  * @param candidateSentences
+  *          element 0 is the candidate sentence, the remaining elements are follow-up
+  *          sentences; {@code null} elements among the follow-ups are skipped
+  * @param item
+  *          search hit supplying the title and the source URL
+  * @param fragment
+  *          snippet fragment the candidate was derived from
+  * @param originalSentence
+  *          seed sentence
+  * @param sentsAll
+  *          other seed sentences tried when the first syntactic score is below the
+  *          threshold; may be {@code null}
+  * @return the accepted fragment, or {@code null} if the candidate was rejected or an error
+  *         occurred while scoring it
+  */
+ private Fragment verifyCandidateSentencesAndFormParagraph(
+     String[] candidateSentences, HitBase item, String fragment, String originalSentence, List<String> sentsAll) {
+   if (candidateSentences == null || candidateSentences.length == 0)
+     return null;
+
+   Fragment result = null;
+
+   String pageSentence = candidateSentences[0];
+   // skip null entries so that the literal text "null" never ends up in the output
+   StringBuilder followBuf = new StringBuilder();
+   for (int i = 1; i < candidateSentences.length; i++) {
+     if (candidateSentences[i] != null)
+       followBuf.append(candidateSentences[i]);
+   }
+   String followSent = followBuf.toString();
+   String title = item.getTitle();
+
+   // resultant sentence SHOULD NOT be longer than four times the size of
+   // snippet fragment
+   if (pageSentence == null || pageSentence.length() <= 50
+       || (float) pageSentence.length() / (float) fragment.length() >= 4.0)
+     return null;
+
+   try { // get score from syntactic match between sentence in
+         // original text and mined sentence
+     SentencePairMatchResult matchRes = sm.assessRelevance(pageSentence
+         + " " + title, originalSentence);
+     List<List<ParseTreeChunk>> match = matchRes.getMatchResult();
+     if (!matchRes.isVerbExists() || matchRes.isImperativeVerb()) {
+       System.out
+           .println("Rejected Sentence : No verb OR Yes imperative verb :"
+               + pageSentence);
+       return null;
+     }
+
+     double syntScore = parseTreeChunkListScorer
+         .getParseTreeChunkListScore(match);
+     System.out.println(parseTreeChunk.listToString(match) + " "
+         + syntScore + "\n pre-processed sent = '" + pageSentence);
+
+     // trying other sents; a null list simply means there are none to try
+     if (syntScore < RELEVANCE_THRESHOLD && sentsAll != null) {
+       for (String currSent : sentsAll) {
+         if (currSent.startsWith(originalSentence))
+           continue;
+         List<List<ParseTreeChunk>> currMatch = sm.assessRelevance(currSent, pageSentence)
+             .getMatchResult();
+         double syntScoreCurr = parseTreeChunkListScorer
+             .getParseTreeChunkListScore(currMatch);
+         if (syntScoreCurr > syntScore) {
+           syntScore = syntScoreCurr;
+           // remember the match that produced the best score so the log below reports it
+           match = currMatch;
+         }
+       }
+       if (syntScore > RELEVANCE_THRESHOLD) {
+         System.out.println("Got match with other sent: "
+             + parseTreeChunk.listToString(match) + " " + syntScore);
+       }
+     }
+
+     double measScore = stringDistanceMeasurer.measureStringDistance(
+         originalSentence, pageSentence);
+
+     if ((syntScore > RELEVANCE_THRESHOLD || measScore > 0.5)
+         && measScore < 0.8) {
+       String pageSentenceProc = GeneratedSentenceProcessor
+           .acceptableMinedSentence(pageSentence);
+       if (pageSentenceProc != null) {
+         pageSentenceProc = GeneratedSentenceProcessor
+             .processSentence(pageSentenceProc);
+         followSent = GeneratedSentenceProcessor.processSentence(followSent);
+         if (followSent != null) {
+           pageSentenceProc += " " + followSent;
+         }
+
+         pageSentenceProc = Utils.convertToASCII(pageSentenceProc);
+         // score: syntactic score + string-distance score + length bonus
+         result = new Fragment(pageSentenceProc, syntScore + measScore
+             + (double) pageSentenceProc.length() / (double) 50);
+         result.setSourceURL(item.getUrl());
+         result.fragment = fragment;
+
+         System.out.println("Accepted sentence: " + pageSentenceProc
+             + "| with title= " + title);
+         System.out.println("For fragment = " + fragment);
+       } else
+         System.out
+             .println("Rejected sentence due to wrong area at webpage: "
+                 + pageSentence);
+     } else
+       System.out.println("Rejected sentence due to low score: "
+           + pageSentence);
+   } catch (Throwable t) {
+     // best effort: a failure while scoring one candidate must not abort the whole run
+     t.printStackTrace();
+   }
+
+   return result;
+}
+ /**
+  * Takes a single search result for an entity which is the subject of the essay
+  * to be written and forms essay sentences from the title, abstract, and
+  * possibly original page.
+  * <p>
+  * Each snippet fragment of the hit is turned into candidate sentences, the candidates are
+  * verified against the seed, and the accepted ones are stored on the hit via
+  * {@code item.setFragments}.
+  *
+  * @param item
+  *          search result
+  * @param originalSentence
+  *          seed for the essay to be written
+  * @param sentsAll
+  *          list of other sentences in the seed if it is multi-sentence
+  * @return the same {@code item}, with its fragments set to the accepted ones (possibly an
+  *         empty list)
+  */
+ public HitBase buildParagraphOfGeneratedText(HitBase item,
+     String originalSentence, List<String> sentsAll) {
+   List<Fragment> results = new ArrayList<Fragment>();
+
+   Triple<List<String>, String, String[]> fragmentExtractionResults = formCandidateFragmentsForPage(item, originalSentence, sentsAll);
+   List<String> allFragms = (List<String>)fragmentExtractionResults.getFirst();
+
+   for (String fragment : allFragms) {
+     String[] candidateSentences = formCandidateSentences(fragment, fragmentExtractionResults);
+     if (candidateSentences == null)
+       continue;
+     Fragment res = verifyCandidateSentencesAndFormParagraph(candidateSentences, item, fragment, originalSentence, sentsAll);
+     if (res != null)
+       results.add(res);
+   }
+
+   item.setFragments(results);
+   return item;
+ }
+
+
+
+
+/**
+ * Demo entry point: generates content for a hard-coded seed and prints the hits, first
+ * with {@code HitBase.toString} and then with {@code HitBase.toResultantString}.
+ *
+ * @param args
+ *          ignored
+ */
+public static void main(String[] args) {
+  ContentGenerator generator = new ContentGenerator();
+
+  try {
+    // Seed sentence for content generation. Other seeds that have been tried:
+    // "Britney Spears - The Femme Fatale Tour"
+    // "Rush Time Machine",
+    // "Blue Man Group" ,
+    // "Belly Dance With Zaharah",
+    // "Hollander Musicology Lecture: Danielle Fosler-Lussier, Guest Lecturer",
+    // "Jazz Master and arguably the most famous jazz musician alive, trumpeter Wynton Marsalis",
+    List<HitBase> hits = generator.generateContentAbout("Albert Einstein");
+
+    System.out.println(HitBase.toString(hits));
+    System.out.println(HitBase.toResultantString(hits));
+    // WordFileGenerator.createWordDoc("Essey about Albert Einstein",
+    // hits.get(0).getTitle(), hits);
+  } catch (Exception e) {
+    e.printStackTrace();
+  }
+}
+
+
+
+}
\ No newline at end of file
Added: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/ContentGeneratorRunner.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/ContentGeneratorRunner.java?rev=1555944&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/ContentGeneratorRunner.java (added)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/ContentGeneratorRunner.java Mon Jan 6 17:48:30 2014
@@ -0,0 +1,99 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.similarity.apps;
+
+import java.util.List;
+
+import javax.mail.internet.AddressException;
+import javax.mail.internet.InternetAddress;
+
+import opennlp.tools.apps.utils.email.EmailSender;
+import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;
+
+/**
+ * Command-line entry point that generates content about a topic and e-mails the result.
+ * <p>
+ * Positional arguments as read by {@link #main(String[])}:
+ * <ul>
+ * <li>0 - topic; '+' and '"' are replaced by spaces before it is used as the seed</li>
+ * <li>1 - recipient e-mail address</li>
+ * <li>2 - resource directory handed to {@code ParserChunker2MatcherProcessor.getInstance}
+ * (optional)</li>
+ * <li>3, 4, 5 - two ints and a float handed to the {@code RelatedSentenceFinder}
+ * constructor (optional); presumably steps number, search results number and relevance
+ * threshold - TODO confirm against that constructor</li>
+ * <li>6 - language; a value starting with "es" selects {@code RelatedSentenceFinderML}
+ * (optional)</li>
+ * <li>7 - Bing key (optional; a built-in key is used when absent)</li>
+ * </ul>
+ * Missing optional arguments fall back to defaults instead of failing with an
+ * {@link ArrayIndexOutOfBoundsException}.
+ */
+public class ContentGeneratorRunner {
+  // NOTE(review): the key and the mail credentials below are secrets committed to source
+  // control; they are kept unchanged here to preserve behaviour but should be moved to
+  // configuration and rotated.
+  private static final String DEFAULT_BING_KEY = "e8ADxIjn9YyHx36EihdjH/tMqJJItUrrbPTUpKahiU0=";
+  private static final String SMTP_HOST = "smtp.live.com";
+  private static final String SENDER = "bgalitsky@hotmail.com";
+  private static final String SENDER_PASSWORD = "borgalor";
+
+  /** Returns {@code args[index]}, or {@code null} when that position was not supplied. */
+  private static String argOrNull(String[] args, int index) {
+    return (args != null && index < args.length) ? args[index] : null;
+  }
+
+  /** Sends the generated content for {@code rawTopic} to {@code recipient}. */
+  private static void sendGeneratedContent(EmailSender sender, String rawTopic,
+      String recipient, String generatedContent) throws Exception {
+    sender.sendMail(SMTP_HOST, SENDER, SENDER_PASSWORD, new InternetAddress(SENDER), new InternetAddress[]{new InternetAddress(recipient)}, new InternetAddress[]{}, new InternetAddress[]{},
+        "Generated content for you on '" + rawTopic.replace('+', ' ') + "'", generatedContent, null);
+  }
+
+  public static void main(String[] args) {
+    if (args == null || args.length < 2) {
+      System.err.println("Usage: ContentGeneratorRunner <topic> <recipientEmail>"
+          + " [resourceDir [<int> <int> <float> [lang [bingKey]]]]");
+      return;
+    }
+
+    // the instance itself is not used below; the call is kept from the original,
+    // presumably to initialise the processor up front - TODO confirm
+    try {
+      String resourceDir = argOrNull(args, 2);
+      if (resourceDir != null)
+        ParserChunker2MatcherProcessor.getInstance(resourceDir);
+      else
+        ParserChunker2MatcherProcessor.getInstance();
+    } catch (Exception e) {
+      e.printStackTrace();
+    }
+
+    String bingKey = argOrNull(args, 7);
+    if (bingKey == null) {
+      bingKey = DEFAULT_BING_KEY;
+    }
+
+    RelatedSentenceFinder f;
+    String lang = argOrNull(args, 6);
+    if (lang != null && lang.startsWith("es")) {
+      // lang sits at index 6, so indices 3..5 are guaranteed to be present here
+      f = new RelatedSentenceFinderML(Integer.parseInt(args[3]), Integer.parseInt(args[4]), Float.parseFloat(args[5]), bingKey);
+      f.setLang(lang);
+    } else if (args.length > 5) {
+      // all three numeric parameters (indices 3, 4 and 5) must be present
+      f = new RelatedSentenceFinder(Integer.parseInt(args[3]), Integer.parseInt(args[4]), Float.parseFloat(args[5]), bingKey);
+    } else {
+      f = new RelatedSentenceFinder();
+    }
+
+    try {
+      List<HitBase> hits = f.generateContentAbout(args[0].replace('+', ' ').replace('"', ' ').trim());
+      System.out.println(HitBase.toString(hits));
+      String generatedContent = HitBase.toResultantString(hits);
+
+      EmailSender s = new EmailSender();
+      try {
+        sendGeneratedContent(s, args[0], args[1], generatedContent);
+      } catch (AddressException e) {
+        // a malformed address will not get better on retry
+        e.printStackTrace();
+      } catch (Exception e) {
+        e.printStackTrace();
+        // any other failure: retry once
+        try {
+          sendGeneratedContent(s, args[0], args[1], generatedContent);
+        } catch (Exception e1) {
+          e1.printStackTrace();
+        }
+      }
+    } catch (Exception e) {
+      e.printStackTrace();
+    }
+  }
+}
+
+/*
+ * C:\stanford-corenlp>java -Xmx1g -jar pt.jar albert+einstein bgalitsky@hotmail.com C:/stanford-corenlp/src/test/resources
+ *
+ * http://173.255.254.250:8983/solr/contentgen/?q=albert+einstein&email=bgalitsky@hotmail.com&resourceDir=/home/solr/solr-4.4.0/example/src/test/resources&workDir=/home/solr/solr-4.4.0/example/solr-webapp/webapp/WEB-INF/lib&stepsNum=20&searchResultsNum=100&relevanceThreshold=0.5&lang=es-US&bingKey=e8ADxIjn9YyHx36EihdjH/tMqJJItUrrbPTUpKahiU0=
+ */