You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by bg...@apache.org on 2012/09/19 20:46:32 UTC
svn commit: r1387703 - in
/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools:
similarity/apps/ similarity/apps/utils/ textsimilarity/chunker2matcher/
Author: bgalitsky
Date: Wed Sep 19 18:46:31 2012
New Revision: 1387703
URL: http://svn.apache.org/viewvc?rev=1387703&view=rev
Log:
OPENNLP-537: provide access to generic search engines to demonstrate search results re-ranking
Added:
opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/WebSearchEngineResultsScraper.java
- copied, changed from r1362702, opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingSearchResultsScraper.java
Removed:
opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingSearchResultsScraper.java
Modified:
opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/ (props changed)
opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingQueryRunner.java
opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingWebQueryRunner.java
opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SearchResultsProcessor.java
opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SpeechRecognitionResultsProcessor.java
opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/Pair.java
opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserCacheSerializer.java
opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessor.java
Propchange: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/
------------------------------------------------------------------------------
--- svn:ignore (added)
+++ svn:ignore Wed Sep 19 18:46:31 2012
@@ -0,0 +1 @@
+relevance_verifier
Modified: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingQueryRunner.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingQueryRunner.java?rev=1387703&r1=1387702&r2=1387703&view=diff
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingQueryRunner.java (original)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingQueryRunner.java Wed Sep 19 18:46:31 2012
@@ -31,7 +31,8 @@ import org.json.JSONArray;
import org.json.JSONObject;
public class BingQueryRunner {
- protected static final String APP_ID = "DD4E2A5DF8B7E5801ED443E47DC600D5F3E62713";
+ protected static final String APP_ID = "e8ADxIjn9YyHx36EihdjH/tMqJJItUrrbPTUpKahiU0=";
+ //"DD4E2A5DF8B7E5801ED443E47DC600D5F3E62713";
// TODO user needs to have own APP_ID from Bing API
private float snapshotSimilarityThreshold = 0.4f;
Modified: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingWebQueryRunner.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingWebQueryRunner.java?rev=1387703&r1=1387702&r2=1387703&view=diff
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingWebQueryRunner.java (original)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingWebQueryRunner.java Wed Sep 19 18:46:31 2012
@@ -42,13 +42,14 @@ public class BingWebQueryRunner {
throws Exception {
String codedQuery = URLEncoder.encode(query, "UTF-8");
- String yahooRequest = "http://api.search.live.net/json.aspx?Appid="
- + BingQueryRunner.APP_ID + "&query=" + codedQuery
- + "&Sources=Web"
+ String yahooRequest = "https://api.datamarket.azure.com/Bing/SearchWeb"
+ // "http://api.search.live.net/json.aspx?Appid="
+ + BingQueryRunner.APP_ID + "&Query=" + codedQuery ;
+ // + "&Sources=Web"
// Common request fields (optional)
- + "&Version=2.0" + "&Market=en-us&web.count=" + numbOfHits
+ // + "&Version=2.0" + "&Market=en-us&web.count=" + numbOfHits
// News-specific request fields (optional)
- + "&News.Offset=0";
+ // + "&News.Offset=0";
return yahooRequest;
}
Modified: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SearchResultsProcessor.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SearchResultsProcessor.java?rev=1387703&r1=1387702&r2=1387703&view=diff
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SearchResultsProcessor.java (original)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SearchResultsProcessor.java Wed Sep 19 18:46:31 2012
@@ -33,25 +33,20 @@ public class SearchResultsProcessor exte
ParserChunker2MatcherProcessor sm;
/*
- * Takes Bing API search results and calculates the parse tree similarity
+ * Takes search results from a search engine API (or scraped results) and calculates the parse tree similarity
* between the question and each snippet. Ranks those snippets with higher
* similarity score up
*/
- private BingResponse calculateMatchScoreResortHits(BingResponse resp,
+
+
+ private List<HitBase> calculateMatchScoreResortHits(List<HitBase> hits,
String searchQuery) {
- // TODO
- /*
- * if query is multi-sentence, special handling int indexDot =
- * searchQuery.indexOf("."); if (indexDot>0 &&
- * indexDot<searchQuery.length()-1){ MultipleSentenceQueryAnswerer ans = new
- * MultipleSentenceQueryAnswerer(); return
- * ans.calculateMatchScoreResortHits(resp, searchQuery); }
- */
+
List<HitBase> newHitList = new ArrayList<HitBase>();
sm = ParserChunker2MatcherProcessor.getInstance();
- for (HitBase hit : resp.getHits()) {
- String snapshot = hit.getAbstractText().replace("<b>...</b>", ". ")
+ for (HitBase hit : hits) {
+ String snapshot = hit.getAbstractText().replace("<b>...</b>", ". ").replace("<span class='best-phrase'>", " ").replace("<span>", " ").replace("<span>", " ")
.replace("<b>", "").replace("</b>", "");
snapshot = snapshot.replace("</B>", "").replace("<B>", "")
.replace("<br>", "").replace("</br>", "").replace("...", ". ")
@@ -72,13 +67,13 @@ public class SearchResultsProcessor exte
newHitList.add(hit);
}
Collections.sort(newHitList, new HitBaseComparable());
- resp.setHits(newHitList);
+
LOG.info("\n\n ============= NEW ORDER ================= ");
for (HitBase hit : newHitList) {
LOG.info(hit.toString());
}
- return resp;
+ return newHitList;
}
public void close() {
@@ -86,13 +81,21 @@ public class SearchResultsProcessor exte
}
public List<HitBase> runSearch(String query) {
+
+ WebSearchEngineResultsScraper scraper = new WebSearchEngineResultsScraper();
+ List<HitBase> hits = scraper.runSearch(query);
+ hits = calculateMatchScoreResortHits(hits, query);
+ return hits;
+ }
+
+ public List<HitBase> runSearchViaAPI(String query) {
BingResponse resp = null, // obtained from bing
newResp = null; // re-sorted based on similarity
try {
List<String> resultList = search(query, "", "", 30);
resp = populateBingHit(resultList.get(0));
// now we apply our own relevance filter
- newResp = calculateMatchScoreResortHits(resp, query);
+ newResp.setHits(calculateMatchScoreResortHits(resp.getHits(), query));
} catch (Exception e) {
// e.printStackTrace();
Modified: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SpeechRecognitionResultsProcessor.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SpeechRecognitionResultsProcessor.java?rev=1387703&r1=1387702&r2=1387703&view=diff
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SpeechRecognitionResultsProcessor.java (original)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SpeechRecognitionResultsProcessor.java Wed Sep 19 18:46:31 2012
@@ -27,11 +27,12 @@ import opennlp.tools.textsimilarity.Pars
import opennlp.tools.textsimilarity.SentencePairMatchResult;
import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;
-public class SpeechRecognitionResultsProcessor extends BingWebQueryRunner {
+public class SpeechRecognitionResultsProcessor /*extends BingWebQueryRunner*/ {
private static Logger LOG = Logger
.getLogger("opennlp.tools.similarity.apps.SpeechRecognitionResultsProcessor");
private ParseTreeChunkListScorer parseTreeChunkListScorer = new ParseTreeChunkListScorer();
ParserChunker2MatcherProcessor sm;
+ WebSearchEngineResultsScraper scraper = new WebSearchEngineResultsScraper();
/**
* Gets an expression and tries to find it on the web. If search results are
@@ -40,19 +41,19 @@ public class SpeechRecognitionResultsPro
* results ate not similar to this phrase, we conclude that the phrase is
* meaningless (does not make sense, nobody has ever said something like that)
*
- * @param resp
- * BingResponse, search results for a phrase being assesses with
+ * @param hits
+ * list of search results for a phrase being assessed with
* respect to meaningfulness
* @param searchQuery
* the phrase we are assessing
* @return total similarity score for all search results
*/
- private double calculateTotalMatchScoreForHits(BingResponse resp,
+ private double calculateTotalMatchScoreForHits(List<HitBase> hits,
String searchQuery) {
sm = ParserChunker2MatcherProcessor.getInstance();
double totalMatchScore = 0;
- for (HitBase hit : resp.getHits()) {
+ for (HitBase hit : hits) {
String snapshot = hit.getAbstractText().replace("<b>...</b>", ". ")
.replace("<b>", "").replace("</b>", "");
snapshot = snapshot.replace("</B>", "").replace("<B>", "")
@@ -103,9 +104,8 @@ public class SpeechRecognitionResultsPro
BingResponse resp = null, // obtained from bing
newResp = null; // re-sorted based on similarity
try {
- List<String> resultList = search(sentence, "", "", 10);
- resp = populateBingHit(resultList.get(0));
- double scoreForSentence = calculateTotalMatchScoreForHits(resp,
+ List<HitBase> resultList = scraper.runSearch(sentence);
+ double scoreForSentence = calculateTotalMatchScoreForHits(resultList,
sentence);
System.out.println("Total meaningfulness score = " + scoreForSentence
+ " for sentence = " + sentence);
Copied: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/WebSearchEngineResultsScraper.java (from r1362702, opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingSearchResultsScraper.java)
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/WebSearchEngineResultsScraper.java?p2=opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/WebSearchEngineResultsScraper.java&p1=opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingSearchResultsScraper.java&r1=1362702&r2=1387703&rev=1387703&view=diff
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingSearchResultsScraper.java (original)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/WebSearchEngineResultsScraper.java Wed Sep 19 18:46:31 2012
@@ -28,9 +28,9 @@ import java.util.Set;
import org.apache.commons.lang.StringUtils;
-public class BingSearchResultsScraper {
+public class WebSearchEngineResultsScraper {
- protected static String fetchPageBing(String url) {
+ protected static String fetchPageSearchEngine(String url) {
System.out.println("fetch url " + url);
String pageContent = null;
StringBuffer buf = new StringBuffer();
@@ -62,7 +62,7 @@ public class BingSearchResultsScraper {
return buf.toString();
}
- private static List<String> extractURLesFromPage(String content, String domain) {
+ private static List<String> extractURLsFromPage(String content, String domain) {
List<String> results = new ArrayList<String>();
if (content == null)
return results;
@@ -87,29 +87,77 @@ public class BingSearchResultsScraper {
return results;
}
- private static String formRequestURL(String seedURL) {
- String requestUrl = "http://www.bing.com/search?q=site:" + seedURL;
+ private static List<HitBase> extractSearchResultFromPage(String content) {
+ List<HitBase> results = new ArrayList<HitBase>();
+ if (content == null)
+ return results;
+ content = StringUtils.substringBetween(content, "<div id=\"results",
+ "class=\"pagination");
+ if (content == null)
+ return results;
+ String[] srchResArea = content.split("</p>");
+ if (srchResArea == null)
+ return results;
+ for (String u : srchResArea) {
+ try {
+ u = u.substring(5);
+ HitBase hit = new HitBase();
+ String url = StringUtils.substringBetween(u, "class=\"url", "</span>");
+ if (url!=null)
+ url = url.substring(2);
+ String title = StringUtils.substringBetween(u, "\">", "</a><br />");
+ title = title.substring(title.indexOf("\">")+2);
+ String abstr = StringUtils.substringBetween(u, "\"body\">", "</span><br /");
+ hit.setUrl(url);
+ hit.setAbstractText(abstr);
+ hit.setTitle(title);
+ results.add(hit);
+ } catch (Exception e) {
+ //problem parsing SERP page; source-specific problem, so we swallow exceptions here
+ }
+ }
+
+ return results;
+ }
+
+ private static String formRequestURL(String query) {
+ String requestUrl = "http://www.hakia.com/search/web?q=" + query.replace(' ','+');
return requestUrl;
}
public List<String> getURLsForWebDomain(String domain) {
- return extractURLesFromPage(fetchPageBing(formRequestURL(domain)), domain);
+ return extractURLsFromPage(fetchPageSearchEngine(formRequestURL(domain)), domain);
}
public Set<String> getURLsForWebDomainIterations(String domain) {
List<String> results = new ArrayList<String>();
- List<String> res = extractURLesFromPage(
- fetchPageBing(formRequestURL(domain)), domain);
+ List<String> res = extractURLsFromPage(
+ fetchPageSearchEngine(formRequestURL(domain)), domain);
for (String r : res)
- results.addAll(extractURLesFromPage(fetchPageBing(formRequestURL(r)), r));
+ results.addAll(extractURLsFromPage(fetchPageSearchEngine(formRequestURL(r)), r));
return new HashSet<String>(results);
}
+
+ public List<HitBase> runSearch(String query) {
+ List<HitBase> hits = new ArrayList<HitBase>();
+ try {
+ String serp = fetchPageSearchEngine(formRequestURL(query));
+ hits = extractSearchResultFromPage(serp);
+
+ } catch (Exception e) {
+
+ return hits;
+ }
+
+ hits = HitBase.removeDuplicates(hits);
+ return hits;
+ }
public static void main(String[] args) {
- System.out.println(new BingSearchResultsScraper()
- .getURLsForWebDomainIterations("www.sfgate.com/entertainment/"));
+ WebSearchEngineResultsScraper scraper = new WebSearchEngineResultsScraper();
+ System.out.println(scraper.runSearch("lady gaga in san francisco"));
}
}
Modified: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/Pair.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/Pair.java?rev=1387703&r1=1387702&r2=1387703&view=diff
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/Pair.java (original)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/Pair.java Wed Sep 19 18:46:31 2012
@@ -17,6 +17,10 @@
package opennlp.tools.similarity.apps.utils;
+import java.util.Comparator;
+
+import opennlp.tools.similarity.apps.HitBase;
+
/**
* Generic pair class for holding two objects. Often used as return object.
*
@@ -54,4 +58,19 @@ public class Pair<T1, T2> {
public void setSecond(T2 second) {
this.second = second;
}
+
+ public class PairComparable implements Comparator<Pair<T1, T2>> {
+ // @Override
+ public int compare(Pair o1, Pair o2) {
+ int b = -2;
+ if ( o1.second instanceof Float && o2.second instanceof Float){
+
+ b = (((Float)o1.second > (Float)o2.second) ? -1
+ : (((Float)o1.second == (Float)o2.second) ? 0 : 1));
+ }
+ return b;
+ }
+ }
+
}
+
Modified: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserCacheSerializer.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserCacheSerializer.java?rev=1387703&r1=1387702&r2=1387703&view=diff
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserCacheSerializer.java (original)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserCacheSerializer.java Wed Sep 19 18:46:31 2012
@@ -120,7 +120,8 @@ public class ParserCacheSerializer {
+ parseCacheFileNameCSV), ',');
lines = reader.readAll();
} catch (FileNotFoundException e) {
- e.printStackTrace();
+ //e.printStackTrace();
+ System.err.println("Cannot find cache file");
return null;
} catch (IOException ioe) {
ioe.printStackTrace();
Modified: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessor.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessor.java?rev=1387703&r1=1387702&r2=1387703&view=diff
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessor.java (original)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessor.java Wed Sep 19 18:46:31 2012
@@ -117,8 +117,9 @@ public class ParserChunker2MatcherProces
initializePosTagger();
initializeParser();
initializeChunker();
- } catch (Exception e) {
- LOG.fine("model cant be read and we rely on cache");
+ } catch (Exception e) { // a typical error when 'model' is not installed
+ System.err.println("Please install OpenNLP model files in 'src/test/resources' (folder 'model'");
+ LOG.fine("The model can't be read and we rely on cache");
}
}