You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by bg...@apache.org on 2014/01/06 18:48:32 UTC
svn commit: r1555944 [7/11] - in /opennlp/sandbox/opennlp-similarity/src: main/java/opennlp/tools/apps/ main/java/opennlp/tools/apps/contentgen/ main/java/opennlp/tools/apps/contentgen/multithreaded/ main/java/opennlp/tools/apps/relevanceVocabs/ main/j...

Added: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/ContentGeneratorSupport.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/ContentGeneratorSupport.java?rev=1555944&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/ContentGeneratorSupport.java (added)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/ContentGeneratorSupport.java Mon Jan  6 17:48:30 2014
@@ -0,0 +1,478 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.similarity.apps;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.List;
+import java.util.logging.Logger;
+
+import opennlp.tools.parse_thicket.Triple;
+import opennlp.tools.parse_thicket.apps.SnippetToParagraph;
+import opennlp.tools.parse_thicket.apps.SnippetToParagraph.TextChunk;
+import opennlp.tools.parse_thicket.apps.SnippetToParagraph.TextChunkComparable;
+import opennlp.tools.similarity.apps.utils.PageFetcher;
+import opennlp.tools.similarity.apps.utils.StringDistanceMeasurer;
+import opennlp.tools.similarity.apps.utils.Utils;
+import opennlp.tools.textsimilarity.ParseTreeChunk;
+import opennlp.tools.textsimilarity.ParseTreeChunkListScorer;
+import opennlp.tools.textsimilarity.SentencePairMatchResult;
+import opennlp.tools.textsimilarity.TextProcessor;
+import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;
+
+import org.apache.commons.lang.StringUtils;
+
+/*
+ * This class supports content generation by static functions
+ * 
+ */
+
+public class ContentGeneratorSupport {
+	private static Logger LOG = Logger
+			.getLogger("opennlp.tools.similarity.apps.ContentGeneratorSupport");
+
+	/**
+	 * Takes a sentence and extracts noun phrases and entity names to form search
+	 * queries for finding relevant sentences on the web, which are then subject
+	 * to relevance assessment by Similarity. Search queries should not be too
+	 * general (irrelevant search results) or too specific (too few search
+	 * results).
+	 * <p>
+	 * Only the first phrase group returned by the chunker is used. From each
+	 * phrase the lemmas whose POS tag starts with "N" or "J" are kept. A phrase
+	 * yields a query when 2 to 5 such words remain; with fewer than 4 words,
+	 * every word must additionally contain an upper-case character. If no phrase
+	 * qualifies, the constraints are relaxed to "at least 2 words". The original
+	 * sentence is always appended as the last query.
+	 * 
+	 * @param sentence
+	 *          input sentence to form queries
+	 * @return List<String> of search expressions; generated expressions have the
+	 *         form " +word1 +word2 ...", followed by the sentence itself
+	 */
+	public static List<String> buildSearchEngineQueryFromSentence(String sentence) {
+		List<ParseTreeChunk> nPhrases = ParserChunker2MatcherProcessor
+				.getInstance().formGroupedPhrasesFromChunksForSentence(sentence).get(0);
+		List<String> queryArrayStr = new ArrayList<String>();
+
+		for (ParseTreeChunk ch : nPhrases) {
+			String query = joinLemmasTaggedNOrJ(ch);
+			String[] words = query.split(" ");
+			if (words.length < 2 || words.length > 5)
+				continue;
+			// if only two or three words, then it has to be a person name,
+			// title or geo location: reject as soon as one word has no capital
+			if (words.length < 4 && containsWordWithoutCapital(words))
+				continue;
+			queryArrayStr.add(toMandatoryTermsQuery(query));
+		}
+
+		if (queryArrayStr.isEmpty()) { // release constraints on NP down to 2
+			// keywords
+			for (ParseTreeChunk ch : nPhrases) {
+				String query = joinLemmasTaggedNOrJ(ch);
+				if (query.split(" ").length < 2)
+					continue;
+				queryArrayStr.add(toMandatoryTermsQuery(query));
+			}
+		}
+
+		queryArrayStr = removeDuplicatesFromQueries(queryArrayStr);
+		queryArrayStr.add(sentence);
+
+		return queryArrayStr;
+	}
+
+	// Joins, space separated, the lemmas of a chunk whose POS tag starts with "N" or "J".
+	private static String joinLemmasTaggedNOrJ(ParseTreeChunk ch) {
+		StringBuilder query = new StringBuilder();
+		int size = ch.getLemmas().size();
+		for (int i = 0; i < size; i++) {
+			String pos = ch.getPOSs().get(i);
+			if (pos.startsWith("N") || pos.startsWith("J")) {
+				query.append(ch.getLemmas().get(i)).append(' ');
+			}
+		}
+		return query.toString().trim();
+	}
+
+	// True when at least one word is identical to its lower-cased form.
+	private static boolean containsWordWithoutCapital(String[] words) {
+		for (String w : words) {
+			if (w.toLowerCase().equals(w))
+				return true;
+		}
+		return false;
+	}
+
+	// Turns "a b c" into " +a +b +c" so that every keyword is mandatory.
+	private static String toMandatoryTermsQuery(String query) {
+		return " +" + query.trim().replace(" ", " +");
+	}
+	
+	public static String[] cleanListOfSents(String[] sents) {
+		List<String> sentsClean = new ArrayList<String>();
+		for (String s : sents) {
+			if (s == null || s.trim().length() < 30 || s.length() < 20)
+				continue;
+			sentsClean.add(s);
+		}
+		return (String[]) sentsClean.toArray(new String[0]);
+	}
+
+	public static String cleanSpacesInCleanedHTMLpage(String pageContent){ //was 4 spaces 
+		 //was 3 spaces => now back to 2
+		//TODO - verify regexp!!
+		pageContent = pageContent.trim().replaceAll("([a-z])(\\s{2,3})([A-Z])", "$1. $3")
+				//replaceAll("[a-z]  [A-Z]", ". $0")// .replace("  ",
+				// ". ")
+				.replace("..", ".").replace(". . .", " ").
+				replace(".    .",". ").trim(); // sometimes   html breaks are converted into ' ' (two spaces), so
+		// we need to put '.'
+		return pageContent;
+	}
+
+	/**
+	 * Removes dupes from queries to ease cleaning dupes and repetitive search
+	 * afterwards. Of two entries that are more similar than the threshold, the
+	 * later one is dropped; empty entries are never compared and always kept.
+	 * <p>
+	 * If the comparison fails with an exception, the failure is logged with its
+	 * cause and an unfiltered copy of the input is returned, so that no query is
+	 * silently lost.
+	 * 
+	 * @param hits
+	 *          List<String> of sentences (search queries, or search results
+	 *          abstracts, or titles); null is treated as an empty list
+	 * @return new, modifiable List<String> of sentences where dupes are removed
+	 */
+	public static List<String> removeDuplicatesFromQueries(List<String> hits) {
+		if (hits == null)
+			return new ArrayList<String>();
+
+		StringDistanceMeasurer meas = new StringDistanceMeasurer();
+		double dupeThresh = 0.8; // if more similar, then considered dupes (was 0.7)
+		int size = hits.size();
+		boolean[] isDupe = new boolean[size]; // constant-time lookup per index
+		int firstDupe = -1; // first duplicate found, reported in the log
+		try {
+			for (int i = 0; i < size; i++) {
+				String title1 = hits.get(i);
+				if (StringUtils.isEmpty(title1))
+					continue;
+				for (int j = i + 1; j < size; j++) {
+					String title2 = hits.get(j);
+					if (StringUtils.isEmpty(title2))
+						continue;
+					if (meas.measureStringDistance(title1, title2) > dupeThresh) {
+						// dupes found, later list member to be deleted
+						if (firstDupe < 0)
+							firstDupe = j;
+						isDupe[j] = true;
+					}
+				}
+			}
+		} catch (Exception e) {
+			LOG.log(java.util.logging.Level.SEVERE,
+					"Problem removing duplicates from query list", e);
+			return new ArrayList<String>(hits);
+		}
+
+		List<String> hitsDedup = new ArrayList<String>(size);
+		for (int i = 0; i < size; i++)
+			if (!isDupe[i])
+				hitsDedup.add(hits.get(i));
+
+		if (firstDupe >= 0) {
+			LOG.info("Removed duplicates from formed query, including "
+					+ hits.get(firstDupe));
+		}
+
+		return hitsDedup;
+	}
+
+	/**
+	 * Removes dupes from search results. For every pair of hits (i, j) with
+	 * i < j, each fragment of hit j whose result text is more similar than the
+	 * threshold to a fragment of hit i is dropped from hit j. Hits are modified
+	 * in place; no hit is removed from the list itself.
+	 * 
+	 * @param hits
+	 *          List<HitBase> of search results objects
+	 * @return the same List<HitBase>, where duplicate fragments are removed from
+	 *         the later hits
+	 */
+	public static List<HitBase> removeDuplicatesFromResultantHits(
+			List<HitBase> hits) {
+		StringDistanceMeasurer meas = new StringDistanceMeasurer();
+		double dupeThresh = 0.7; // if more similar, then considered dupes (was 0.8)
+		try {
+			for (int i = 0; i < hits.size(); i++)
+				for (int j = i + 1; j < hits.size(); j++) {
+					HitBase hit2 = hits.get(j);
+					List<Fragment> fragmList1 = hits.get(i).getFragments();
+					List<Fragment> fragmList2 = hit2.getFragments();
+					// work on a copy so fragmList2 can be iterated while removing
+					List<Fragment> fragmList2Results = new ArrayList<Fragment>(fragmList2);
+					for (Fragment f1 : fragmList1)
+						for (Fragment f2 : fragmList2) {
+							String sf1 = f1.getResultText();
+							String sf2 = f2.getResultText();
+							// both texts must be present (the second operand used to re-test sf1)
+							if (StringUtils.isEmpty(sf1) || StringUtils.isEmpty(sf2))
+								continue;
+							if (meas.measureStringDistance(sf1, sf2) > dupeThresh) {
+								fragmList2Results.remove(f2);
+								LOG.info("Removed duplicates from formed fragments list: "
+										+ sf2);
+							}
+						}
+
+					hit2.setFragments(fragmList2Results);
+					hits.set(j, hit2);
+				}
+		} catch (Exception e) {
+			LOG.log(java.util.logging.Level.SEVERE,
+					"Problem removing duplicates from list of fragment", e);
+		}
+		return hits;
+	}
+
+
+
+	// given a fragment from snippet, finds an original sentence at a webpage by
+	// optimizing alignmemt score
+	public static String[] getFullOriginalSentenceFromWebpageBySnippetFragment(
+			String fragment, String[] sents) {
+		if (fragment.trim().length() < 15)
+			return null;
+
+		StringDistanceMeasurer meas = new StringDistanceMeasurer();
+		Double dist = 0.0;
+		String result = null, followSent = "";
+		for (int i = 0; i < sents.length; i++) {
+			String s = sents[i];
+			if (s == null || s.length() < 30)
+				continue;
+			Double distCurr = meas.measureStringDistance(s, fragment);
+			if (distCurr > dist && distCurr > 0.4) {
+				result = s;
+				dist = distCurr;
+				try {
+					if (i < sents.length - 1 && sents[i + 1].length() > 60) { 
+						String f1 = GeneratedSentenceProcessor.acceptableMinedSentence(sents[i+1]);
+						if (f1!=null){
+							followSent = f1;
+						}
+					}
+
+					if (i < sents.length - 2 && sents[i + 2].length() > 60) {
+						String f2 = GeneratedSentenceProcessor.acceptableMinedSentence(sents[i+2]);
+						if (f2!=null){
+							followSent += " "+f2;
+						}
+					}
+				} catch (Exception e) {
+					// TODO Auto-generated catch block
+					e.printStackTrace();
+				}
+			}
+		}
+		return new String[] { result, followSent };
+	}
+
+	/**
+	 * Given a fragment from a snippet, finds the original sentence at a webpage
+	 * by optimizing the alignment score, plus the sentence directly after it.
+	 * <p>
+	 * Sentences shorter than 30 characters are ignored. The highest-scoring
+	 * sentence is returned only if its score exceeds 0.4; the following sentence
+	 * is returned only if it is longer than 60 characters.
+	 * <p>
+	 * The best score now starts at 0.0. It used to start at 10.0 with the best
+	 * index at -1, so sents[-1] was indexed whenever no score exceeded 10.
+	 * NOTE(review): the thresholds used in this class suggest scores stay within
+	 * [0, 1], which would mean that happened on every call - confirm against
+	 * StringDistanceMeasurer.
+	 * 
+	 * @param fragment
+	 *          snippet fragment to look for
+	 * @param sents
+	 *          sentences of the page, may contain nulls
+	 * @return null if the trimmed fragment is shorter than 15 characters,
+	 *         otherwise { bestSentence or null, followingSentence or null }
+	 */
+	public static String[] getBestFullOriginalSentenceFromWebpageBySnippetFragment(
+			String fragment, String[] sents) {
+		if (fragment.trim().length() < 15)
+			return null;
+		int bestSentIndex = -1;
+		StringDistanceMeasurer meas = new StringDistanceMeasurer();
+		double distBest = 0.0;
+		String result = null, followSent = null;
+		for (int i = 0; i < sents.length; i++) {
+			String s = sents[i];
+			if (s == null || s.length() < 30)
+				continue;
+			double distCurr = meas.measureStringDistance(s, fragment);
+			if (distCurr > distBest) {
+				distBest = distCurr;
+				bestSentIndex = i;
+			}
+		}
+
+		// bestSentIndex stays -1 when no sentence qualified
+		if (bestSentIndex >= 0 && distBest > 0.4) {
+			result = sents[bestSentIndex];
+
+			int next = bestSentIndex + 1;
+			if (next < sents.length && sents[next] != null
+					&& sents[next].length() > 60) {
+				followSent = sents[next];
+			}
+		}
+
+		return new String[] { result, followSent };
+	}
+
+	/**
+	 * Splits a downloaded page into text portions, keeps the longest ones and
+	 * turns them into cleaned sentences.
+	 * <p>
+	 * Portions are delimited by runs of five spaces. Internally the text is
+	 * re-delimited with '&' and then '#', so those characters, if present in the
+	 * page, also act as delimiters. Each portion is passed through
+	 * cleanSpacesInCleanedHTMLpage(); at most the 100 longest portions are then
+	 * handed to cleanSplitListOfSents().
+	 * 
+	 * @param downloadedPage
+	 *          text of the page
+	 * @return sentences produced by cleanSplitListOfSents()
+	 */
+	public String[] extractSentencesFromPage(String downloadedPage)
+	{
+		int maxSentsFromPage= 100;
+
+		downloadedPage= downloadedPage.replace("     ", "&");
+		downloadedPage = downloadedPage.replaceAll("(?:&)+", "#");
+		String[] sents = downloadedPage.split("#");
+		List<TextChunk> sentsList = new ArrayList<TextChunk>();
+		for(String s: sents){
+			s = ContentGeneratorSupport.cleanSpacesInCleanedHTMLpage(s);
+			sentsList.add(new TextChunk(s, s.length()));
+		}
+
+		// ascending by length: the longest portions are at the end of the list
+		Collections.sort(sentsList, new TextChunkComparable());
+
+		// Take the last maxSentsFromPage entries. The start index used to be one
+		// position too low, which dropped the longest portion for longer lists.
+		int initIndex = Math.max(0, sentsList.size() - maxSentsFromPage);
+		String[] longestSents = new String[sentsList.size() - initIndex];
+		for(int i=initIndex; i< sentsList.size(); i++){
+			longestSents[i - initIndex] = sentsList.get(i).text;
+		}
+
+		sents = cleanSplitListOfSents(longestSents);
+
+		//sents = removeDuplicates(sents);
+		//sents = verifyEnforceStartsUpperCase(sents);
+
+		return sents;
+	}
+
+	/** A piece of text together with the length recorded for it. */
+	public class TextChunk {
+		/** the text of the chunk */
+		public String text;
+		/** the length passed in by the creator of the chunk */
+		public int len;
+
+		public TextChunk(String s, int length) {
+			text = s;
+			len = length;
+		}
+	}
+
+	/** Orders TextChunk objects ascending by their len field. */
+	public class TextChunkComparable implements Comparator<TextChunk>
+	{
+		public int compare(TextChunk ch1, TextChunk ch2)
+		{
+			if (ch1.len == ch2.len)
+				return 0;
+			return ch1.len > ch2.len ? 1 : -1;
+		}
+	}
+
+	/**
+	 * Filters text portions and splits the surviving ones into sentences.
+	 * <p>
+	 * A portion is skipped when it is null or shorter than 20 characters, when
+	 * GeneratedSentenceProcessor.acceptableMinedSentence() rejects it, when its
+	 * average length per '.'-separated piece is below 40, or when its average
+	 * length per ' '-separated piece is below 4. Surviving portions are split by
+	 * TextProcessor.splitToSentences(); the last sentence of each split is
+	 * dropped, sentences containing '|' are skipped, em tags are replaced by
+	 * spaces and the text is passed through Utils.convertToASCII().
+	 * 
+	 * @param longestSents
+	 *          text portions, may contain nulls
+	 * @return cleaned sentences
+	 */
+	protected String[] cleanSplitListOfSents(String[] longestSents){
+		float minFragmentLength = 40, minFragmentLengthSpace=4;
+
+		List<String> sentsClean = new ArrayList<String>();
+		for (String sentenceOrMultSent : longestSents)
+		{
+			if (sentenceOrMultSent==null || sentenceOrMultSent.length()<20)
+				continue;
+			if (GeneratedSentenceProcessor.acceptableMinedSentence(sentenceOrMultSent)==null){
+				System.out.println("Rejected sentence by GeneratedSentenceProcessor.acceptableMinedSentence = "+sentenceOrMultSent);
+				continue;
+			}
+			// aaa. hhh hhh.  kkk . kkk ll hhh. lll kkk n.
+			int numOfDots = sentenceOrMultSent.replace('.','&').split("&").length;
+			float avgSentenceLengthInTextPortion = (float)sentenceOrMultSent.length() /(float) numOfDots;
+			if ( avgSentenceLengthInTextPortion<minFragmentLength)
+				continue;
+			// o oo o ooo o o o ooo oo ooo o o oo
+			numOfDots = sentenceOrMultSent.replace(' ','&').split("&").length;
+			avgSentenceLengthInTextPortion = (float)sentenceOrMultSent.length() /(float) numOfDots;
+			if ( avgSentenceLengthInTextPortion<minFragmentLengthSpace)
+				continue;
+
+			List<String> furtherSplit = TextProcessor.splitToSentences(sentenceOrMultSent);
+			if (furtherSplit == null)
+				continue;
+
+			// The last sentence of the split is dropped, as before. Iterating up
+			// to size()-1 instead of removing it avoids an exception on an empty
+			// list and leaves the list owned by the splitter unmodified.
+			for (int k = 0; k < furtherSplit.size() - 1; k++) {
+				String s = furtherSplit.get(k);
+				if (s == null || s.indexOf('|')>-1)
+					continue;
+				s = s.replace("<em>"," ").replace("</em>"," ");
+				s = Utils.convertToASCII(s);
+				sentsClean.add(s);
+			}
+		}
+		return sentsClean.toArray(new String[0]);
+	}	
+
+	/**
+	 * Variant of cleanSplitListOfSents() that splits each text portion into
+	 * sentences first and then filters the individual sentences.
+	 * <p>
+	 * Portions that are null or shorter than 40 characters are skipped. A
+	 * sentence is skipped when it is null or shorter than 20 characters, when
+	 * GeneratedSentenceProcessor.acceptableMinedSentence() rejects it, when its
+	 * average length per '.'-separated piece is below 40, when its average
+	 * length per ' '-separated piece is below 4, or when it contains '|'.
+	 * Accepted sentences are passed through Utils.convertToASCII().
+	 * 
+	 * @param longestSents
+	 *          text portions, may contain nulls
+	 * @return cleaned sentences
+	 */
+	protected String[] cleanSplitListOfSentsFirstSplit(String[] longestSents){
+		float minFragmentLength = 40, minFragmentLengthSpace=4;
+
+		List<String> sentsClean = new ArrayList<String>();
+		for (String sentenceOrMultSent : longestSents)
+		{
+			if (sentenceOrMultSent==null || sentenceOrMultSent.length()<minFragmentLength)
+				continue;
+			List<String> furtherSplit = TextProcessor.splitToSentences(sentenceOrMultSent);
+			for(String sentence: furtherSplit ){
+				if (sentence==null || sentence.length()<20)
+					continue;
+				if (GeneratedSentenceProcessor.acceptableMinedSentence(sentence)==null){
+					continue;
+				}
+				// aaa. hhh hhh.  kkk . kkk ll hhh. lll kkk n.
+				// Both the piece count and the length refer to this sentence;
+				// the length used to be taken from the whole portion.
+				int numOfDots = sentence.replace('.','&').split("&").length;
+				float avgSentenceLengthInTextPortion = (float)sentence.length() /(float) numOfDots;
+				if ( avgSentenceLengthInTextPortion<minFragmentLength)
+					continue;
+				// o oo o ooo o o o ooo oo ooo o o oo
+				numOfDots = sentence.replace(' ','&').split("&").length;
+				avgSentenceLengthInTextPortion = (float)sentence.length() /(float) numOfDots;
+				if ( avgSentenceLengthInTextPortion<minFragmentLengthSpace)
+					continue;
+
+				if (sentence.indexOf('|')>-1)
+					continue;
+				sentence = Utils.convertToASCII(sentence);
+				sentsClean.add(sentence);
+			}
+		}
+		return sentsClean.toArray(new String[0]);
+	}
+	
+	/**
+	 * Scratch driver that shows how the sentence-boundary regular expressions
+	 * behave on a sample string and prints the results.
+	 * <p>
+	 * Replacement strings only reference groups that exist in their pattern; a
+	 * "$1" reference in a pattern without a capturing group makes replaceAll()
+	 * throw IndexOutOfBoundsException.
+	 * 
+	 * @param args
+	 *          not used
+	 */
+	public static void main(String[] args){
+		String s = "You can grouP   parts  Of your regular expression  In your pattern   You grouP  elements";
+
+		// expression used by cleanSpacesInCleanedHTMLpage(): gap of 2 to 3 whitespace characters
+		String sr = s.replaceAll("([a-z])(\\s{2,3})([A-Z])", "$1. $3");
+		System.out.println(sr);
+
+		// earlier variant: exactly two spaces between lower and upper case
+		String srTwoSpaces = s.replaceAll("([a-z])  ([A-Z])", "$1. $2");
+		System.out.println(srTwoSpaces);
+
+		// earlier variant: prefix every "  X" with ". " ($0 is the whole match)
+		String sr1 = s.replaceAll("  [A-Z]", ". $0");
+		System.out.println(sr1);
+	}
+
+}
+
+
+

Modified: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/GeneratedSentenceProcessor.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/GeneratedSentenceProcessor.java?rev=1555944&r1=1555943&r2=1555944&view=diff
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/GeneratedSentenceProcessor.java (original)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/GeneratedSentenceProcessor.java Mon Jan  6 17:48:30 2014
@@ -25,189 +25,297 @@ import opennlp.tools.similarity.apps.uti
 import org.apache.commons.lang.StringUtils;
 
 public class GeneratedSentenceProcessor {
-  public static String acceptableMinedSentence(String sent) {
-    // if too many commas => seo text
 
-    String[] commas = StringUtils.split(sent, ',');
-    String[] spaces = StringUtils.split(sent, ' ');
-    if ((float) commas.length / (float) spaces.length > 0.7) {
-      System.out.println("Rejection: too many commas");
-      return null;
-    }
-
-    String[] pipes = StringUtils.split(sent, '|');
-    if (StringUtils.split(sent, '|').length > 2
-        || StringUtils.split(sent, '>').length > 2) {
-      System.out.println("Rejection: too many |s or >s ");
-      return null;
-    }
-    String sentTry = sent.toLowerCase();
-    // if too many long spaces
-    String sentSpaces = sentTry.replace("   ", "");
-    if (sentSpaces.length() - sentTry.length() > 10) // too many spaces -
-      // suspicious
-      return null;
-
-    if (sentTry.indexOf("click here") > -1 || sentTry.indexOf(" wikip") > -1
-        || sentTry.indexOf("copyright") > -1
-        || sentTry.indexOf("operating hours") > -1
-        || sentTry.indexOf("days per week") > -1
-        || sentTry.indexOf("click for") > -1 || sentTry.indexOf("photos") > -1
-        || sentTry.indexOf("find the latest") > -1
-        || sentTry.startsWith("subscribe")
-        || sentTry.indexOf("Terms of Service") > -1
-        || sentTry.indexOf("clicking here") > -1
-        || sentTry.indexOf("skip to") > -1 || sentTry.indexOf("sidebar") > -1
-        || sentTry.indexOf("Tags:") > -1 || sentTry.startsWith("Posted by")
-        || sentTry.indexOf("available online") > -1
-        || sentTry.indexOf("get online") > -1
-        || sentTry.indexOf("buy online") > -1
-        || sentTry.indexOf("not valid") > -1 || sentTry.indexOf("discount") > -1
-        || sentTry.indexOf("official site") > -1
-        || sentTry.indexOf("this video") > -1
-        || sentTry.indexOf("this book") > -1
-        || sentTry.indexOf("this product") > -1
-        || sentTry.indexOf("paperback") > -1 || sentTry.indexOf("hardcover") > -1
-        || sentTry.indexOf("audio cd") > -1
-        || sentTry.indexOf("related searches") > -1
-        || sentTry.indexOf("permission is granted") > -1
-        || sentTry.indexOf("[edit") > -1
-        || sentTry.indexOf("edit categories") > -1
-        || sentTry.indexOf("free license") > -1
-        || sentTry.indexOf("permission is granted") > -1
-        || sentTry.indexOf("under the terms") > -1
-        || sentTry.indexOf("rights reserved") > -1
-        || sentTry.indexOf("wikipedia") > -1 || sentTry.endsWith("the")
-        || sentTry.endsWith("the.") || sentTry.startsWith("below") 
-        || sentTry.indexOf("recipient of")>-1 || sentTry.indexOf("this message")>-1 
-        ||sentTry.indexOf( "mailing list")>-1 ||sentTry.indexOf( "purchase order")>-1
-        ||sentTry.indexOf( "mon-fri")>-1 ||sentTry.indexOf( "email us")>-1 ||sentTry.indexOf( "privacy pol")>-1 ||sentTry.indexOf( "back to top")>-1 
-        ||sentTry.indexOf( "click here")>-1 ||sentTry.indexOf( "for details")>-1 ||sentTry.indexOf( "assistance?")>-1 ||sentTry.indexOf( "chat live")>-1
-        ||sentTry.indexOf( "free shipping")>-1 ||sentTry.indexOf( "company info")>-1 ||sentTry.indexOf( "satisfaction g")>-1 ||sentTry.indexOf( "contact us")>-1
-        ||sentTry.startsWith( "fax") ||sentTry.startsWith( "write") || sentTry.startsWith( "email")||sentTry.indexOf( "conditions")>-1 ||sentTry.indexOf( "chat live")>-1
-        ||sentTry.startsWith( "we ") ||sentTry.indexOf( "the recipient")>-1 ||sentTry.indexOf( "day return")>-1 ||sentTry.indexOf( "days return")>-1
-        
-        ||sentTry.startsWith( "fax") ||sentTry.indexOf( "refund it")>-1 || sentTry.indexOf( "your money")>-1
-        ||sentTry.indexOf( "cond???")>-1 ||sentTry.indexOf( "purchase orders")>-1
-        ||sentTry.startsWith( "exchange it ") ||sentTry.indexOf( "return it")>-1 ||sentTry.indexOf( "day return")>-1 ||sentTry.indexOf( "days return")>-1
-    )
-      return null;
-
-    // count symbols indicating wrong parts of page to mine for text
-    // if short and contains too many symbols indicating wrong area: reject
-    String sentWrongSym = sentTry.replace(">", "&&&").replace("ï¿½", "&&&")
-        .replace("|", "&&&").replace(":", "&&&").replace("/", "&&&")
-        .replace("-", "&&&").replace("%", "&&&");
-    if ((sentWrongSym.length() - sentTry.length()) >= 4
-        && sentTry.length() < 200) // twice ot more
-      return null;
-
-    sent = sent.replace('[', ' ').replace(']', ' ')
-        .replace("_should_find_orig_", "").replace(".   .", ". ")
-        .replace("amp;", " ").replace("1.", " ").replace("2.", " ")
-        .replace("3.", " ").replace("4.", " ").replace("2009", "2011")
-        .replace("2008", "2011").replace("2006", "2011")
-        .replace("2007", "2011").replace("VIDEO:", " ").replace("Video:", " ")
-        .replace("no comments", " ").replace("  ", " ").replace("  ", " ")
-        .replace("(more.)", "").replace("more.", "").replace("<more>", "")
-        .replace("[more]", "").replace(".,", ".").replace("&lt;", "")
-        .replace("p&gt;", "").replace("product description", "");
-
-    // TODO .replace("a.", ".");
-
-    int endIndex = sent.indexOf(" posted");
-    if (endIndex > 0)
-      sent = sent.substring(0, endIndex);
-
-    return sent;
-  }
-
-  public static String processSentence(String pageSentence) {
-    if (pageSentence == null)
-      return "";
-    pageSentence = Utils.fullStripHTML(pageSentence);
-    pageSentence = StringUtils.chomp(pageSentence, "..");
-    pageSentence = StringUtils.chomp(pageSentence, ". .");
-    pageSentence = StringUtils.chomp(pageSentence, " .");
-    pageSentence = StringUtils.chomp(pageSentence, ".");
-    pageSentence = StringUtils.chomp(pageSentence, "...");
-    pageSentence = StringUtils.chomp(pageSentence, " ....");
-    pageSentence = pageSentence.replace("::", ":").replace(".,", ". ")
-        .replace("(.)", "");
-
-    pageSentence = pageSentence.trim();
-    pageSentence = pageSentence.replaceAll("\\s+", " "); // make single
-    // spaces
-    // everywhere
-
-    String[] pipes = StringUtils.split(pageSentence, '|'); // removed
-    // shorter part
-    // of sentence
-    // at the end
-    // after pipe
-    if (pipes.length == 2
-        && ((float) pipes[0].length() / (float) pipes[1].length() > 3.0)) {
-      int pipePos = pageSentence.indexOf("|");
-      if (pipePos > -1)
-        pageSentence = pageSentence.substring(0, pipePos - 1).trim();
-
-    }
-
-    if (!StringUtils.contains(pageSentence, '.')
-        && !StringUtils.contains(pageSentence, '?')
-        && !StringUtils.contains(pageSentence, '!'))
-      pageSentence = pageSentence + ". ";
-
-    pageSentence = pageSentence.replace(" .", ".").replace("..", ".").trim();
-    if (!pageSentence.endsWith("."))
-      pageSentence += ". ";
-    return pageSentence;
-  }
-
-  public static void main(String[] args) {
-
-    String para = "About Albert Einstein     15 External links  16 Credits         Youth and schooling  Albert Einstein was born into a Jewish family";
-    para = "inventions of albert einstein                            what was albert einsteins invention                            invention of einstein                            what were albert einsteins inventions ";
-
-    para = para.replaceAll("  [A-Z]", ". $0");
-    System.out.println(para);
-
-    para = "Page 2 of 93";
-
-    System.exit(0);
-    RelatedSentenceFinder f = new RelatedSentenceFinder();
-    try {
-      List<HitBase> hits = f
-          .findRelatedOpinionsForSentence(
-              "Give me a break, there is no reason why you can't retire in ten years if you had been a rational investor and not a crazy trader",
-              Arrays
-                  .asList(new String[] { "Give me a break there is no reason why you can't retire in ten years if you had been a rational investor and not a crazy trader. For example you went to cash in 2008 and stay in cash until now you made nothing. Whereas people who rode out the storm are doing fine so let's quit focusing on the loser who think they are so smart and went to 100% cash and are wondering what happen. Its a market that always moves unlike your mattress.", }));
-      StringBuffer buf = new StringBuffer();
-
-      for (HitBase h : hits) {
-        List<Fragment> frags = h.getFragments();
-        for (Fragment fr : frags) {
-          if (fr.getResultText() != null && fr.getResultText().length() > 3)
-            buf.append(fr.getResultText());
-        }
-      }
-
-    } catch (Exception e) {
-      // TODO Auto-generated catch block
-      e.printStackTrace();
-    }
-
-  }
-
-  public static String normalizeForSentenceSplitting(String pageContent) {
-    pageContent.replace("Jan.", "January").replace("Feb.", "February")
-        .replace("Mar.", "March").replace("Apr.", "April")
-        .replace("Jun.", "June").replace("Jul.", "July")
-        .replace("Aug.", "August").replace("Sep.", "September")
-        .replace("Oct.", "October").replace("Nov.", "November")
-        .replace("Dec.", "December");
+	/**
+	 * Phrases that cause a sentence to be rejected when they occur anywhere in
+	 * it. They are matched against the lower-cased sentence, so every entry has
+	 * to be lower case ("Tags:" could never match and is now "tags:").
+	 */
+	public static String[] occurs = new String[]{ "click here", "wikipedia", "retrieved", "isbn",
+		"http", "www.",
+		"copyright", "advertise",  "(accessed", "[edit]", "[citation needed]",
+		"site map",  "email updates",  "contact us", "rss feeds",  "cite this site",
+		"operating hours", "last modified", "product catalog",
+		"days per week", "leave a comment", "corporate information",  
+		"employment opportunities", "terms of use", "private policy", "parental guidelines", "copyright policy",  "ad choices",
+		"about us",  "about our ads",  "privacy policy",  "terms of use",
+		"click for", "photos",
+		"find the latest",		       
+		"terms of service",
+		"clicking here",
+		"skip to", "sidebar",
+		"tags:", 
+		"available online",
+		"get online",
+		"buy online",
+		"not valid", "get discount",
+		"official site",
+		"this video",
+		//"this book",
+		"this product",
+		"paperback", "hardcover",
+		"audio cd",
+		"related searches",
+		"permission is granted",
+		"[edit",
+		"edit categories",
+		"free license",
+		"permission is granted",
+		"under the terms",
+		"rights reserved",
+		"wikipedia", 
+		"recipient of", "this message", 
+		"mailing list",  "purchase order",
+		"mon-fri",  "email us",  "privacy pol",  "back to top", 
+		"click here",  "for details",  "assistance?",  "chat live",
+		"free shipping",  "company info",  "satisfaction g",  "contact us",
+		"menu.", "search.",  "sign in", "home.",
+		"additional terms", "may apply"};
+
+	/**
+	 * Lower-case phrases kept separately from {@link #occurs}.
+	 * NOTE(review): the name suggests they reject a sentence only when it starts
+	 * with one of them - the loop that uses this array is cut off in this view,
+	 * confirm there.
+	 */
+	public static String[] occursStartsWith = new String[]{
+		"fax",  "write","email", "contact",  "conditions",  "chat live",
+		"we ",  "the recipient",  "day return",  "days return",
+		"refund it",  "your money",
+		"purchase orders",
+		"exchange it ",  "return it",  "day return",  "days return",
+		"subscribe","posted by", "below" , "corporate",
+		"this book"};
+	/**
+	 * Decides whether a sentence mined from a web page is acceptable and, if it
+	 * is, returns a cleaned-up version of it.
+	 * <p>
+	 * The sentence is rejected (null is returned) when it is null or shorter
+	 * than 40 characters, when the number of comma-, period- or
+	 * bracket-separated pieces is too high relative to the number of words, when
+	 * it splits into more than two pieces on '|' or '>', when it contains more
+	 * than three runs of three spaces, when a prohibited phrase is found by
+	 * isProhibitiveWordsOccurOrStartWith(), or when it is shorter than 200
+	 * characters and contains two or more of the symbols listed below.
+	 * <p>
+	 * Two checks were ineffective before: the bracket check re-tested the period
+	 * count, and the long-spaces check subtracted in the wrong order so it could
+	 * never fire. Both are now active.
+	 * 
+	 * @param sent
+	 *          raw sentence
+	 * @return the cleaned sentence, or null if the sentence is rejected
+	 */
+	public static String acceptableMinedSentence(String sent) {
+		if (sent==null || sent.length()<40)
+			return null;
+		// if too many commas => seo text
+
+		String[] commas = StringUtils.split(sent, ',');
+		String[] spaces = StringUtils.split(sent, ' ');
+		if ((float) commas.length / (float) spaces.length > 0.5) {
+			System.out.println("Rejection: too many commas  in sent ='"+sent);
+			return null;
+		}
+
+		String[] periods = StringUtils.split(sent.replace('.', '#'), '#');
+		if ((float) periods.length / (float) spaces.length > 0.2) {
+			System.out.println("Rejection: too many periods in sent ='"+sent);
+			return null;
+		}
+		// [x] is deliberately not counted, to avoid rejecting sentences with refs[]
+		String[] brakets = StringUtils.split(sent.replace('(', '#').replace(')', '#'), '#');
+		if ((float) brakets.length / (float) spaces.length > 0.2) {
+			System.out.println("Rejection: too many brakets in sent ='"+sent);
+			return null;
+		}
+		
+		if (StringUtils.split(sent, '|').length > 2
+				|| StringUtils.split(sent, '>').length > 2) {
+			System.out.println("Rejection: too many |s or >s in sent ='"+sent);
+			return null;
+		}
+		String sentTry = sent.toLowerCase();
+		// if too many long spaces: every removed run of three spaces shortens
+		// the text by 3, so more than 10 means at least four such runs
+		String sentSpaces = sentTry.replace("   ", "");
+		if (sentTry.length() - sentSpaces.length() > 10) // too many spaces -
+			// suspicious
+			return null;
+		if (isProhibitiveWordsOccurOrStartWith(sentTry))
+			return null;
+
+		// count symbols indicating wrong parts of page to mine for text
+		// if short and contains too many symbols indicating wrong area: reject
+		// (each symbol adds 2 characters, so >= 4 means twice or more)
+		String sentWrongSym = sentTry.replace(">", "&&&").replace("ï¿½", "&&&")
+				.replace("|", "&&&").replace(":", "&&&").replace("/", "&&&")
+				.replace("-", "&&&").replace("%", "&&&");
+		if ((sentWrongSym.length() - sentTry.length()) >= 4
+				&& sentTry.length() < 200)
+			return null;
+
+		// (the former rewriting of the years 2006-2009 to 2011 is disabled)
+		sent = sent.replace('[', ' ').replace(']', ' ')
+				.replace("_should_find_orig_", "").replace(".   .", ". ")
+				.replace("amp;", " ").replace("1.", " ").replace("2.", " ")
+				.replace("3.", " ").replace("4.", " ")
+				.replace("VIDEO:", " ").replace("Video:", " ")
+				.replace("no comments", " ").replace("  ", " ").replace("  ", " ")
+				.replace("(more.)", "").replace("more.", "").replace("<more>", "")
+				.replace("[more]", "").replace(".,", ".").replace("&lt;", "")
+				.replace("p&gt;", "").replace("product description", "");
+
+		// TODO .replace("a.", ".");
+
+		// cut off a trailing " posted ..." part
+		int endIndex = sent.indexOf(" posted");
+		if (endIndex > 0)
+			sent = sent.substring(0, endIndex);
+
+		return sent;
+	}
+
+	/**
+	 * Normalizes a sentence taken from a page: strips HTML, removes trailing dot
+	 * artifacts, collapses whitespace, cuts off a short part after a single '|'
+	 * and makes sure the sentence ends with punctuation.
+	 * <p>
+	 * Sentences rejected by acceptableMinedSentence() (which includes null)
+	 * yield an empty string. ". " is appended unless the sentence already ends
+	 * with '.', ':', '!' or '?'; the '?' case used to be a second test for '.'.
+	 * 
+	 * @param pageSentence
+	 *          raw sentence, may be null
+	 * @return the normalized sentence, or "" if it was rejected
+	 */
+	public static String processSentence(String pageSentence) {
+		// also covers null input: acceptableMinedSentence(null) returns null
+		if (acceptableMinedSentence(pageSentence)==null){
+			System.out.println("Rejected sentence by GenerSentProc.processSentence.acceptableMinedSentence()");
+			return "";
+		}
+		pageSentence = Utils.fullStripHTML(pageSentence);
+		pageSentence = StringUtils.chomp(pageSentence, "..");
+		pageSentence = StringUtils.chomp(pageSentence, ". .");
+		pageSentence = StringUtils.chomp(pageSentence, " .");
+		pageSentence = StringUtils.chomp(pageSentence, ".");
+		pageSentence = StringUtils.chomp(pageSentence, "...");
+		pageSentence = StringUtils.chomp(pageSentence, " ....");
+		pageSentence = pageSentence.replace("::", ":").replace(".,", ". ")
+				.replace("(.)", "");
+
+		pageSentence = pageSentence.trim();
+		pageSentence = pageSentence.replaceAll("\\s+", " "); // make single
+		// spaces
+		// everywhere
+
+		// remove the shorter part of the sentence at the end after a pipe
+		String[] pipes = StringUtils.split(pageSentence, '|');
+		if (pipes.length == 2
+				&& ((float) pipes[0].length() / (float) pipes[1].length() > 3.0)) {
+			int pipePos = pageSentence.indexOf("|");
+			// pipePos == 0 (leading pipe) would make the end index negative
+			if (pipePos > 0)
+				pageSentence = pageSentence.substring(0, pipePos - 1).trim();
+
+		}
+
+		if (!StringUtils.contains(pageSentence, '.')
+				&& !StringUtils.contains(pageSentence, '?')
+				&& !StringUtils.contains(pageSentence, '!'))
+			pageSentence = pageSentence + ". ";
+
+		pageSentence = pageSentence.replace(" .", ".").replace("..", ".").trim();
+		if (!pageSentence.endsWith(".") && !pageSentence.endsWith(":") 
+				&&!pageSentence.endsWith("!") &&!pageSentence.endsWith("?"))
+			pageSentence += ". ";
+		return pageSentence;
+	}
+
+	/**
+	 * Tells whether a sentence has to be rejected because of its wording.
+	 * <p>
+	 * The sentence is rejected when it contains any entry of {@code occurs}
+	 * or begins with any entry of {@code occursStartsWith}. The reason is
+	 * printed to standard output.
+	 *
+	 * @param sentenceLowercase
+	 *          the sentence to check; the caller is expected to pass it in
+	 *          lower case, no case conversion is done here
+	 * @return {@code true} if a prohibited expression was found
+	 */
+	public static boolean isProhibitiveWordsOccurOrStartWith(String sentenceLowercase){
+		// expressions prohibited anywhere in the sentence
+		for (String anywhere : occurs) {
+			if (!sentenceLowercase.contains(anywhere))
+				continue;
+			System.out.println("Found prohibited occurrence "+ anywhere +" \n in sentence = "+  sentenceLowercase);
+			return true;
+		}
+
+		// expressions prohibited only at the beginning of the sentence
+		for (String prefix : occursStartsWith) {
+			if (!sentenceLowercase.startsWith(prefix))
+				continue;
+			System.out.println("Found prohibited occurrence Start With  "+ prefix +" \n in sentence = "+  sentenceLowercase);
+			return true;
+		}
+
+		// checks for sentences ending with "the" / "the." or starting with
+		// "below" are currently not applied
+		return false;
+	}
+
+	/**
+	 * Ad-hoc manual driver used while developing the sentence filters.
+	 * <p>
+	 * It runs a long sample text through
+	 * {@code GeneratedSentenceProcessor.acceptableMinedSentence}, prints the
+	 * outcome of a whitespace-to-period experiment and then terminates the JVM
+	 * with {@code System.exit(0)}. Everything after that call (the
+	 * {@code RelatedSentenceFinder} run) is therefore never executed.
+	 *
+	 * @param args
+	 *          command line arguments, not used
+	 */
+	public static void main(String[] args) {
+		
+		// sample of navigation/boilerplate text mined from a web page
+		String sentence = "Accepted sentence: Educational. Video. About Us menu. Home. Nobel Prizes and Laureates. Nobel Prizes and Laureates. Physics Prize. Chemistry Prize. Medicine Prize. Literature Prize. Peace Prize. Prize in Economic Sciences. Quick Facts. Nomination. Nomination. Physics Prize. Chemistry Prize. Medicine Prize. Literature Prize. Peace Prize. Prize in Economic Sciences. Nomination Archive. Ceremonies. Ceremonies. Ceremony Archive. Nobel Banquet Menus. Nobel Banquet Dress Code. The Queen's Gowns. Eyewitness Reports. Alfred Nobel. Alfred Nobel. Alfred Nobel's Will. Alfred Nobel's Life. Private Library of Alfred Nobel. Books on Alfred Nobel. Events. Events. Nobel Week Dialogue. Nobel Prize Inspiration Initiative. Nobel Prize Concert. Exhibitions at the Nobel Museum. Exhibitions at the Nobel Peace Center. About Us. Nobel Prizes and Laureates. Physics PrizesChemistry PrizesMedicine PrizesLiterature PrizesPeace PrizesPrize in Economic Sciences. About the Nobel Prize in Phys
 ics 1921. Albert Einstein. Facts. Biographical. Nobel Lecture. Banquet Speech. Documentary. Photo Gallery. Questions and Answers. Other Resources. All Nobel Prizes in Physics. All Nobel Prizes in 1921. The Nobel Prize in Physics 1921. Albert Einstein. Questions and Answers. Question: When was Albert Einstein born . Answer: Albert Einstein was born on 14 March 1879. Question: Where was he born . Answer: He was born in Ulm, Germany. Question: When did he die . Answer: He died 18 April 1955 in Princeton, New Jersey, USA. Question: Who were his parents . Answer: His father was Hermann Einstein and his mother was Pauline Einstein (born Koch). Question: Did he have any sisters and brothers . Answer: He had one sister named Maja. Question: Did he marry and have children . Answer: He was married to Mileva Mari between 1903 and 1919. They had three children, Lieserl (born 1902), Hans Albert (born 1904) and Eduard (born 1910). He married Elsa L Kwenthal in 1919 and they lived together until h
 er death in 1936. Question: Where did he receive his education . Answer: He received his main education at the following schools:. Catholic elementary school in Munich, Germany (1885-1888). Luitpold Gymnasium in Munich, Germany (1888-1894). Cantonal school in Aarau, Switzerland (1895-1896). Swiss Federal Institute of Technology in Zurich, Switzerland (1896-1900). Ph.D. from Zurich University, Switzerland (1905). Question: When was Albert Einstein awarded the Nobel Prize in Physics . Answer: The Nobel Prize Awarding Institution, the Royal Swedish Academy of Sciences, decided to reserve the Nobel Prize in Physics in 1921, and therefore no Physics Prize was awarded that year.";
+		
+		// NOTE(review): the result is never read afterwards, and the filter of
+		// GeneratedSentenceProcessor is called rather than the one of this class
+		String res = GeneratedSentenceProcessor.acceptableMinedSentence(sentence);
+
+		// the first value of para is overwritten immediately by the second one
+		String para = "About Albert Einstein     15 External links  16 Credits         Youth and schooling  Albert Einstein was born into a Jewish family";
+		para = "inventions of albert einstein                            what was albert einsteins invention                            invention of einstein                            what were albert einsteins inventions ";
+
+		// put ". " in front of every "two spaces + upper-case letter" sequence
+		// ($0 is the whole match)
+		para = para.replaceAll("  [A-Z]", ". $0");
+		System.out.println(para);
+
+		// assigned but not used before the JVM exits
+		para = "Page 2 of 93";
+
+		// terminates the JVM: nothing below this line is executed
+		System.exit(0);
+		RelatedSentenceFinder f = new RelatedSentenceFinder();
+		try {
+			List<HitBase> hits = f
+					.findRelatedOpinionsForSentence(
+							"Give me a break, there is no reason why you can't retire in ten years if you had been a rational investor and not a crazy trader",
+							Arrays
+							.asList(new String[] { "Give me a break there is no reason why you can't retire in ten years if you had been a rational investor and not a crazy trader. For example you went to cash in 2008 and stay in cash until now you made nothing. Whereas people who rode out the storm are doing fine so let's quit focusing on the loser who think they are so smart and went to 100% cash and are wondering what happen. Its a market that always moves unlike your mattress.", }));
+			// collects the text of all fragments longer than 3 characters; the
+			// buffer is not printed or returned
+			StringBuffer buf = new StringBuffer();
+
+			for (HitBase h : hits) {
+				List<Fragment> frags = h.getFragments();
+				for (Fragment fr : frags) {
+					if (fr.getResultText() != null && fr.getResultText().length() > 3)
+						buf.append(fr.getResultText());
+				}
+			}
+
+		} catch (Exception e) {
+			// any failure is only reported on standard error
+			e.printStackTrace();
+		}
+
+	}
+
+	/**
+	 * Expands abbreviated month names so that the period of the abbreviation
+	 * is not mistaken for a sentence boundary by a sentence splitter.
+	 * <p>
+	 * {@code String} is immutable, so the outcome of the {@code replace}
+	 * chain has to be returned; the previous code discarded it and handed
+	 * back the unchanged input.
+	 *
+	 * @param pageContent
+	 *          text to normalize, may be {@code null}
+	 * @return the text with "Jan.", "Feb.", "Mar.", "Apr.", "Jun.", "Jul.",
+	 *         "Aug.", "Sep.", "Oct.", "Nov." and "Dec." replaced by the full
+	 *         month names, or {@code null} if the input is {@code null}
+	 */
+	public static String normalizeForSentenceSplitting(String pageContent) {
+		if (pageContent == null)
+			return null;
+
+		return pageContent.replace("Jan.", "January").replace("Feb.", "February")
+				.replace("Mar.", "March").replace("Apr.", "April")
+				.replace("Jun.", "June").replace("Jul.", "July")
+				.replace("Aug.", "August").replace("Sep.", "September")
+				.replace("Oct.", "October").replace("Nov.", "November")
+				.replace("Dec.", "December");
+
+	}
+}
+
+/*
+
+if (sentTry.indexOf("click here")>-1 || sentTry.indexOf(" wikip") > -1
+|| sentTry.indexOf("copyright") > -1
+|| sentTry.indexOf("operating hours") > -1
+|| sentTry.indexOf("days per week") > -1
+|| sentTry.indexOf("click for") > -1 || sentTry.indexOf("photos") > -1
+|| sentTry.indexOf("find the latest") > -1
+|| sentTry.startsWith("subscribe")
+|| sentTry.indexOf("Terms of Service") > -1
+|| sentTry.indexOf("clicking here") > -1
+|| sentTry.indexOf("skip to") > -1 || sentTry.indexOf("sidebar") > -1
+|| sentTry.indexOf("Tags:") > -1 || sentTry.startsWith("Posted by")
+|| sentTry.indexOf("available online") > -1
+|| sentTry.indexOf("get online") > -1
+|| sentTry.indexOf("buy online") > -1
+|| sentTry.indexOf("not valid") > -1 || sentTry.indexOf("discount") > -1
+|| sentTry.indexOf("official site") > -1
+|| sentTry.indexOf("this video") > -1
+|| sentTry.indexOf("this book") > -1
+|| sentTry.indexOf("this product") > -1
+|| sentTry.indexOf("paperback") > -1 || sentTry.indexOf("hardcover") > -1
+|| sentTry.indexOf("audio cd") > -1
+|| sentTry.indexOf("related searches") > -1
+|| sentTry.indexOf("permission is granted") > -1
+|| sentTry.indexOf("[edit") > -1
+|| sentTry.indexOf("edit categories") > -1
+|| sentTry.indexOf("free license") > -1
+|| sentTry.indexOf("permission is granted") > -1
+|| sentTry.indexOf("under the terms") > -1
+|| sentTry.indexOf("rights reserved") > -1
+|| sentTry.indexOf("wikipedia") > -1 || sentTry.endsWith("the")
+|| sentTry.endsWith("the.") || sentTry.startsWith("below") 
+|| sentTry.indexOf("recipient of")>-1 || sentTry.indexOf("this message")>-1 
+||sentTry.indexOf( "mailing list")>-1 ||sentTry.indexOf( "purchase order")>-1
+||sentTry.indexOf( "mon-fri")>-1 ||sentTry.indexOf( "email us")>-1 ||sentTry.indexOf( "privacy pol")>-1 ||sentTry.indexOf( "back to top")>-1 
+||sentTry.indexOf( "click here")>-1 ||sentTry.indexOf( "for details")>-1 ||sentTry.indexOf( "assistance?")>-1 ||sentTry.indexOf( "chat live")>-1
+||sentTry.indexOf( "free shipping")>-1 ||sentTry.indexOf( "company info")>-1 ||sentTry.indexOf( "satisfaction g")>-1 ||sentTry.indexOf( "contact us")>-1
+||sentTry.startsWith( "fax") ||sentTry.startsWith( "write") || sentTry.startsWith( "email")||sentTry.indexOf( "conditions")>-1 ||sentTry.indexOf( "chat live")>-1
+||sentTry.startsWith( "we ") ||sentTry.indexOf( "the recipient")>-1 ||sentTry.indexOf( "day return")>-1 ||sentTry.indexOf( "days return")>-1
+
+||sentTry.startsWith( "fax") ||sentTry.indexOf( "refund it")>-1 || sentTry.indexOf( "your money")>-1
+||sentTry.indexOf( "cond???")>-1 ||sentTry.indexOf( "purchase orders")>-1
+||sentTry.startsWith( "exchange it ") ||sentTry.indexOf( "return it")>-1 ||sentTry.indexOf( "day return")>-1 ||sentTry.indexOf( "days return")>-1
+)
+return null;
 
-    return pageContent;
-
-  }
-}
\ No newline at end of file
+*/
\ No newline at end of file

Modified: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/HitBase.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/HitBase.java?rev=1555944&r1=1555943&r2=1555944&view=diff
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/HitBase.java (original)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/HitBase.java Mon Jan  6 17:48:30 2014
@@ -26,214 +26,236 @@ import opennlp.tools.similarity.apps.uti
 import org.apache.commons.lang.StringUtils;
 
 public class HitBase {
-  private static final Logger LOG = Logger
-      .getLogger("opennlp.tools.similarity.apps.HitBase");
+	private static final Logger LOG = Logger
+			.getLogger("opennlp.tools.similarity.apps.HitBase");
 
-  private String abstractText;
+	private String abstractText;
 
-  private String clickUrl;
+	private String clickUrl;
 
-  private String displayUrl;
+	private String displayUrl;
 
-  private String url;
-
-  private String date;
-
-  private String title;
-
-  private Double generWithQueryScore;
-
-  private String source;
-
-  private List<String> originalSentences;
-
-  private String pageContent;
-
-  private List<Fragment> fragments;
-
-  public HitBase() {
-    super();
-  }
-
-  public String getPageContent() {
-    return pageContent;
-  }
-
-  public HitBase(String orig, String[] generateds) {
-    originalSentences = new ArrayList<String>();
-    originalSentences.add(orig);
-
-    fragments = new ArrayList<Fragment>();
-    for (String sent : generateds) {
-      Fragment f = new Fragment(sent, 0.0);
-      fragments.add(f);
-    }
-    // the rest of params are null
-  }
-
-  public void setPageContent(String pageContent) {
-    this.pageContent = pageContent;
-  }
-
-  public List<Fragment> getFragments() {
-    return fragments;
-  }
-
-  public void setFragments(List<Fragment> fragments) {
-    this.fragments = fragments;
-  }
-
-  public String getSource() {
-    return source;
-  }
-
-  public void setSource(String source) {
-    this.source = source;
-  }
-
-  public List<String> getOriginalSentences() {
-    return originalSentences;
-  }
-
-  public void setOriginalSentences(List<String> originalSentences) {
-    this.originalSentences = originalSentences;
-  }
-
-  public String getTitle() {
-    return title;
-  }
-
-  public void setTitle(String title) {
-    this.title = title;
-  }
-
-  public String getAbstractText() {
-    return abstractText;
-  }
-
-  public void setAbstractText(String abstractText) {
-    this.abstractText = abstractText;
-  }
-
-  public String getClickUrl() {
-    return clickUrl;
-  }
-
-  public void setClickUrl(String clickUrl) {
-    this.clickUrl = clickUrl;
-  }
-
-  public String getDisplayUrl() {
-    return displayUrl;
-  }
-
-  public void setDisplayUrl(String displayUrl) {
-    this.displayUrl = displayUrl;
-  }
-
-  public String getUrl() {
-    return url;
-  }
-
-  public void setUrl(String url) {
-    this.url = url;
-  }
-
-  public String getDate() {
-    return date;
-  }
-
-  public void setDate(String date) {
-    this.date = date;
-  }
-
-  public Double getGenerWithQueryScore() {
-    return generWithQueryScore;
-  }
-
-  public void setGenerWithQueryScore(Double generWithQueryScore) {
-    this.generWithQueryScore = generWithQueryScore;
-  }
-
-  public String toString() {
-    // return "\n"+this.getUrl()+" | " +this.getTitle()+ " | "+
-    // this.abstractText ;
-    if (this.getFragments() != null && this.getFragments().size() > 0)
-      return this.getFragments().toString();
-    else
-      return this.title;
-  }
-
-  public static String toString(List<HitBase> hits) {
-    StringBuffer buf = new StringBuffer();
-    Boolean pBreak = true;
-    for (HitBase hit : hits) {
-      String fragm = (hit.toString());
-      if (fragm.length() > 15) {
-        if (pBreak)
-          buf.append(fragm + " | ");
-        else
-          buf.append(fragm + " | \n");
-        // switch to opposite
-        if (pBreak)
-          pBreak = false;
-        else
-          pBreak = true;
-      }
-
-    }
-    return buf.toString();
-  }
-
-  public static String toResultantString(List<HitBase> hits) {
-    StringBuffer buf = new StringBuffer();
-    Boolean pBreak = true;
-    for (HitBase hit : hits) {
-      String fragm = hit.getFragments().toString();
-      if (fragm.length() > 15) {
-        if (pBreak)
-          buf.append(fragm + " | 	");
-        else
-          buf.append(fragm + " | \n");
-        // switch to opposite
-        if (pBreak)
-          pBreak = false;
-        else
-          pBreak = true;
-      }
-
-    }
-    return buf.toString().replace("[", "").replace("]", "").replace(" | ", "")
-        .replace(".,", ".").replace(".\"", "\"").replace(". .", ".")
-        .replace(",.", ".");
-  }
-
-  public static List<HitBase> removeDuplicates(List<HitBase> hits) {
-    StringDistanceMeasurer meas = new StringDistanceMeasurer();
-    double imageDupeThresh = 0.8; // if more similar, then considered dupes
-    List<Integer> idsToRemove = new ArrayList<Integer>();
-    List<HitBase> hitsDedup = new ArrayList<HitBase>();
-    try {
-      for (int i = 0; i < hits.size(); i++)
-        for (int j = i + 1; j < hits.size(); j++) {
-          String title1 = hits.get(i).getTitle();
-          String title2 = hits.get(j).getTitle();
-          if (StringUtils.isEmpty(title1) || StringUtils.isEmpty(title2))
-            continue;
-          if (meas.measureStringDistance(title1, title2) > imageDupeThresh) {
-            idsToRemove.add(j); // dupes found, later list member to be deleted
-          }
-        }
-      for (int i = 0; i < hits.size(); i++)
-        if (!idsToRemove.contains(i))
-          hitsDedup.add(hits.get(i));
-      if (hitsDedup.size() < hits.size()) {
-        LOG.info("Removed duplicates from relevant search results, including "
-            + hits.get(idsToRemove.get(0)).getTitle());
-      }
-    } catch (Exception e) {
-      LOG.severe("Problem removing duplicates from relevant images: " + e);
-    }
-    return hitsDedup;
-  }
+	private String url;
+
+	private String date;
+
+	private String title;
+
+	private Double generWithQueryScore;
+
+	private String source;
+
+	private List<String> originalSentences;
+
+	private String pageContent;
+
+	private List<Fragment> fragments;
+
+	/** Creates an empty hit; all fields keep their default value. */
+	public HitBase() {
+		super();
+	}
+
+	/** @return the value of the {@code pageContent} field */
+	public String getPageContent() {
+		return pageContent;
+	}
+
+	/**
+	 * Creates a hit from an original sentence and the sentences generated for
+	 * it.
+	 *
+	 * @param orig
+	 *          becomes the only entry of the original-sentence list
+	 * @param generateds
+	 *          each entry is wrapped in a {@code Fragment} created with
+	 *          {@code 0.0} as second constructor argument; must not be
+	 *          {@code null} because it is iterated directly
+	 */
+	public HitBase(String orig, String[] generateds) {
+		originalSentences = new ArrayList<String>();
+		originalSentences.add(orig);
+
+		fragments = new ArrayList<Fragment>();
+		for (String sent : generateds) {
+			Fragment f = new Fragment(sent, 0.0);
+			fragments.add(f);
+		}
+		// all remaining fields are left unset (null)
+	}
+
+	/** @param pageContent new value of the {@code pageContent} field */
+	public void setPageContent(String pageContent) {
+		this.pageContent = pageContent;
+	}
+
+	/** @return the fragment list; no copy is made */
+	public List<Fragment> getFragments() {
+		return fragments;
+	}
+
+	/** @param fragments new fragment list; stored by reference */
+	public void setFragments(List<Fragment> fragments) {
+		this.fragments = fragments;
+	}
+
+	/** @return the value of the {@code source} field */
+	public String getSource() {
+		return source;
+	}
+
+	/** @param source new value of the {@code source} field */
+	public void setSource(String source) {
+		this.source = source;
+	}
+
+	/** @return the list of original sentences; no copy is made */
+	public List<String> getOriginalSentences() {
+		return originalSentences;
+	}
+
+	/** @param originalSentences new list of original sentences; stored by reference */
+	public void setOriginalSentences(List<String> originalSentences) {
+		this.originalSentences = originalSentences;
+	}
+
+	/** @return the value of the {@code title} field */
+	public String getTitle() {
+		return title;
+	}
+
+	/** @param title new value of the {@code title} field */
+	public void setTitle(String title) {
+		this.title = title;
+	}
+
+	/** @return the value of the {@code abstractText} field */
+	public String getAbstractText() {
+		return abstractText;
+	}
+
+	/** @param abstractText new value of the {@code abstractText} field */
+	public void setAbstractText(String abstractText) {
+		this.abstractText = abstractText;
+	}
+
+	/** @return the value of the {@code clickUrl} field */
+	public String getClickUrl() {
+		return clickUrl;
+	}
+
+	/** @param clickUrl new value of the {@code clickUrl} field */
+	public void setClickUrl(String clickUrl) {
+		this.clickUrl = clickUrl;
+	}
+
+	/** @return the value of the {@code displayUrl} field */
+	public String getDisplayUrl() {
+		return displayUrl;
+	}
+
+	/** @param displayUrl new value of the {@code displayUrl} field */
+	public void setDisplayUrl(String displayUrl) {
+		this.displayUrl = displayUrl;
+	}
+
+	/** @return the value of the {@code url} field */
+	public String getUrl() {
+		return url;
+	}
+
+	/** @param url new value of the {@code url} field */
+	public void setUrl(String url) {
+		this.url = url;
+	}
+
+	/** @return the value of the {@code date} field (kept as a string) */
+	public String getDate() {
+		return date;
+	}
+
+	/** @param date new value of the {@code date} field */
+	public void setDate(String date) {
+		this.date = date;
+	}
+
+	/** @return the value of the {@code generWithQueryScore} field, may be {@code null} */
+	public Double getGenerWithQueryScore() {
+		return generWithQueryScore;
+	}
+
+	/** @param generWithQueryScore new value of the {@code generWithQueryScore} field */
+	public void setGenerWithQueryScore(Double generWithQueryScore) {
+		this.generWithQueryScore = generWithQueryScore;
+	}
+
+	/**
+	 * Text form of this hit.
+	 *
+	 * @return the string form of the fragment list when there is at least one
+	 *         fragment, otherwise the title (which may be {@code null})
+	 */
+	public String toString() {
+		List<Fragment> ownFragments = this.getFragments();
+		boolean hasFragments = ownFragments != null && ownFragments.size() > 0;
+		return hasFragments ? ownFragments.toString() : this.title;
+	}
+
+	/**
+	 * Joins the text form of several hits into one string.
+	 * <p>
+	 * Only hits whose text form is longer than 15 characters are used. Each
+	 * of them is followed by {@code " | "}; every second one is additionally
+	 * followed by a line break, so two hits share a line.
+	 *
+	 * @param hits
+	 *          hits to print
+	 * @return the joined text, empty if no hit qualifies
+	 */
+	public static String toString(List<HitBase> hits) {
+		StringBuilder buf = new StringBuilder();
+		boolean pBreak = true;
+		for (HitBase hit : hits) {
+			String fragm = hit.toString();
+			// toString() returns the title when there are no fragments, and
+			// the title may be null: skip such hits instead of failing
+			if (fragm == null || fragm.length() <= 15)
+				continue;
+			buf.append(fragm).append(pBreak ? " | " : " | \n");
+			// switch to opposite
+			pBreak = !pBreak;
+		}
+		return buf.toString();
+	}
+
+	/**
+	 * Builds the resulting text from the fragments of the given hits.
+	 * <p>
+	 * Hits without a fragment list, or whose fragment list prints to 15
+	 * characters or less, are ignored. The used hits are alternately followed
+	 * by {@code " | "} plus a tab and by {@code " | <br>"} plus a line break.
+	 * Finally square brackets and the {@code " | "} markers are removed and
+	 * some doubled punctuation is cleaned up. An exception raised while a hit
+	 * is processed is printed and the hit is skipped.
+	 *
+	 * @param hits
+	 *          hits to take the fragments from
+	 * @return the cleaned, joined fragment text
+	 */
+	public static String toResultantString(List<HitBase> hits) {
+		StringBuffer collected = new StringBuffer();
+		Boolean stayOnLine = true;
+		for (HitBase current : hits) {
+			try {
+				List<Fragment> parts = current.getFragments();
+				if (parts == null)
+					continue;
+				String text = parts.toString();
+				if (text.length() <= 15)
+					continue;
+				String separator = stayOnLine ? " | 	" : " | <br>\n";
+				collected.append(text + separator);
+				// alternate between the two separators
+				stayOnLine = !stayOnLine;
+			} catch (Exception e) {
+				e.printStackTrace();
+			}
+		}
+
+		String result = collected.toString();
+		result = result.replace("[", "").replace("]", "").replace(" | ", "");
+		result = result.replace(".,", ".").replace(".\"", "\"");
+		return result.replace(". .", ".").replace(",.", ".");
+	}
+	
+	/**
+	 * Collects the URLs of the given hits.
+	 * <p>
+	 * Hits without a URL are skipped. The URLs are appended one directly
+	 * after the other, without any separator. An exception raised while a hit
+	 * is processed is printed and the hit is skipped.
+	 *
+	 * @param hits
+	 *          hits to take the URLs from
+	 * @return the concatenated URLs, empty if there is none
+	 */
+	public static String produceReferenceSection(List<HitBase> hits) {
+		StringBuffer references = new StringBuffer();
+		for (HitBase current : hits) {
+			try {
+				String link = current.getUrl();
+				if (link != null)
+					references.append(link);
+			} catch (Exception e) {
+				e.printStackTrace();
+			}
+		}
+		return references.toString();
+	}
+
+	/**
+	 * Removes hits whose title is a near-duplicate of the title of an earlier
+	 * hit.
+	 * <p>
+	 * Every pair of hits with non-empty titles is compared with
+	 * {@code StringDistanceMeasurer.measureStringDistance}; when the value
+	 * exceeds 0.8 the later hit of the pair is dropped. Any exception is
+	 * logged and whatever has been collected so far is returned.
+	 *
+	 * @param hits
+	 *          hits to filter; the list itself is not modified
+	 * @return a new list with the remaining hits in their original order
+	 */
+	public static List<HitBase> removeDuplicates(List<HitBase> hits) {
+		StringDistanceMeasurer measurer = new StringDistanceMeasurer();
+		// titles scoring above this value are treated as duplicates
+		double threshold = 0.8;
+		List<Integer> duplicateIndexes = new ArrayList<Integer>();
+		List<HitBase> uniqueHits = new ArrayList<HitBase>();
+		try {
+			// pass 1: remember the index of the later member of each duplicate pair
+			for (int first = 0; first < hits.size(); first++) {
+				for (int second = first + 1; second < hits.size(); second++) {
+					String firstTitle = hits.get(first).getTitle();
+					String secondTitle = hits.get(second).getTitle();
+					boolean bothPresent = !StringUtils.isEmpty(firstTitle)
+							&& !StringUtils.isEmpty(secondTitle);
+					if (bothPresent
+							&& meas(measurer, firstTitle, secondTitle) > threshold) {
+						duplicateIndexes.add(second);
+					}
+				}
+			}
+			// pass 2: keep everything that was not marked
+			for (int index = 0; index < hits.size(); index++) {
+				if (duplicateIndexes.contains(index))
+					continue;
+				uniqueHits.add(hits.get(index));
+			}
+			if (uniqueHits.size() < hits.size()) {
+				LOG.info("Removed duplicates from relevant search results, including "
+						+ hits.get(duplicateIndexes.get(0)).getTitle());
+			}
+		} catch (Exception e) {
+			LOG.severe("Problem removing duplicates from relevant images: " + e);
+		}
+		return uniqueHits;
+	}
+
+	/** Similarity of two titles as reported by the given measurer. */
+	private static double meas(StringDistanceMeasurer measurer, String left,
+			String right) {
+		return measurer.measureStringDistance(left, right);
+	}
 }
\ No newline at end of file

Modified: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/HitBaseComparable.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/HitBaseComparable.java?rev=1555944&r1=1555943&r2=1555944&view=diff
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/HitBaseComparable.java (original)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/HitBaseComparable.java Mon Jan  6 17:48:30 2014
@@ -1,3 +1,19 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 package opennlp.tools.similarity.apps;
 
 import java.util.Comparator;