You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by bg...@apache.org on 2012/09/19 20:46:32 UTC

svn commit: r1387703 - in /opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools: similarity/apps/ similarity/apps/utils/ textsimilarity/chunker2matcher/

Author: bgalitsky
Date: Wed Sep 19 18:46:31 2012
New Revision: 1387703

URL: http://svn.apache.org/viewvc?rev=1387703&view=rev
Log:
OPENNLP-537: add access to generic web search engines to demonstrate re-ranking of search results

Added:
    opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/WebSearchEngineResultsScraper.java
      - copied, changed from r1362702, opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingSearchResultsScraper.java
Removed:
    opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingSearchResultsScraper.java
Modified:
    opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/   (props changed)
    opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingQueryRunner.java
    opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingWebQueryRunner.java
    opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SearchResultsProcessor.java
    opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SpeechRecognitionResultsProcessor.java
    opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/Pair.java
    opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserCacheSerializer.java
    opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessor.java

Propchange: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/
------------------------------------------------------------------------------
--- svn:ignore (added)
+++ svn:ignore Wed Sep 19 18:46:31 2012
@@ -0,0 +1 @@
+relevance_verifier

Modified: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingQueryRunner.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingQueryRunner.java?rev=1387703&r1=1387702&r2=1387703&view=diff
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingQueryRunner.java (original)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingQueryRunner.java Wed Sep 19 18:46:31 2012
@@ -31,7 +31,8 @@ import org.json.JSONArray;
 import org.json.JSONObject;
 
 public class BingQueryRunner {
-  protected static final String APP_ID = "DD4E2A5DF8B7E5801ED443E47DC600D5F3E62713";
+  protected static final String APP_ID = "e8ADxIjn9YyHx36EihdjH/tMqJJItUrrbPTUpKahiU0=";
+    //"DD4E2A5DF8B7E5801ED443E47DC600D5F3E62713";
   // TODO user needs to have own APP_ID from Bing API
 
   private float snapshotSimilarityThreshold = 0.4f;

Modified: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingWebQueryRunner.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingWebQueryRunner.java?rev=1387703&r1=1387702&r2=1387703&view=diff
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingWebQueryRunner.java (original)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingWebQueryRunner.java Wed Sep 19 18:46:31 2012
@@ -42,13 +42,14 @@ public class BingWebQueryRunner {
       throws Exception {
     String codedQuery = URLEncoder.encode(query, "UTF-8");
 
-    String yahooRequest = "http://api.search.live.net/json.aspx?Appid="
-        + BingQueryRunner.APP_ID + "&query=" + codedQuery 
-        + "&Sources=Web"
+    String yahooRequest = "https://api.datamarket.azure.com/Bing/SearchWeb"
+     // "http://api.search.live.net/json.aspx?Appid="
+        + BingQueryRunner.APP_ID + "&Query=" + codedQuery ;
+      //  + "&Sources=Web"
         // Common request fields (optional)
-        + "&Version=2.0" + "&Market=en-us&web.count=" + numbOfHits
+       // + "&Version=2.0" + "&Market=en-us&web.count=" + numbOfHits
          // News-specific request fields (optional)
-        + "&News.Offset=0";
+      //  + "&News.Offset=0";
 
     return yahooRequest;
   }

Modified: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SearchResultsProcessor.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SearchResultsProcessor.java?rev=1387703&r1=1387702&r2=1387703&view=diff
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SearchResultsProcessor.java (original)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SearchResultsProcessor.java Wed Sep 19 18:46:31 2012
@@ -33,25 +33,20 @@ public class SearchResultsProcessor exte
   ParserChunker2MatcherProcessor sm;
 
   /*
-   * Takes Bing API search results and calculates the parse tree similarity
+   * Takes a search engine API (or scraped) search results and calculates the parse tree similarity
    * between the question and each snippet. Ranks those snippets with higher
    * similarity score up
    */
-  private BingResponse calculateMatchScoreResortHits(BingResponse resp,
+  
+  
+  private List<HitBase> calculateMatchScoreResortHits(List<HitBase> hits,
       String searchQuery) {
-    // TODO
-    /*
-     * if query is multi-sentence, special handling int indexDot =
-     * searchQuery.indexOf("."); if (indexDot>0 &&
-     * indexDot<searchQuery.length()-1){ MultipleSentenceQueryAnswerer ans = new
-     * MultipleSentenceQueryAnswerer(); return
-     * ans.calculateMatchScoreResortHits(resp, searchQuery); }
-     */
+
     List<HitBase> newHitList = new ArrayList<HitBase>();
     sm = ParserChunker2MatcherProcessor.getInstance();
 
-    for (HitBase hit : resp.getHits()) {
-      String snapshot = hit.getAbstractText().replace("<b>...</b>", ". ")
+    for (HitBase hit : hits) {
+      String snapshot = hit.getAbstractText().replace("<b>...</b>", ". ").replace("<span class='best-phrase'>", " ").replace("<span>", " ").replace("<span>", " ")
           .replace("<b>", "").replace("</b>", "");
       snapshot = snapshot.replace("</B>", "").replace("<B>", "")
           .replace("<br>", "").replace("</br>", "").replace("...", ". ")
@@ -72,13 +67,13 @@ public class SearchResultsProcessor exte
       newHitList.add(hit);
     }
     Collections.sort(newHitList, new HitBaseComparable());
-    resp.setHits(newHitList);
+   
     LOG.info("\n\n ============= NEW ORDER ================= ");
     for (HitBase hit : newHitList) {
       LOG.info(hit.toString());
     }
 
-    return resp;
+    return newHitList;
   }
 
   public void close() {
@@ -86,13 +81,21 @@ public class SearchResultsProcessor exte
   }
 
   public List<HitBase> runSearch(String query) {
+    
+    WebSearchEngineResultsScraper scraper = new WebSearchEngineResultsScraper();
+    List<HitBase> hits = scraper.runSearch(query);
+    hits = calculateMatchScoreResortHits(hits, query);
+    return hits;
+  }
+  
+  public List<HitBase> runSearchViaAPI(String query) {
     BingResponse resp = null, // obtained from bing
     newResp = null; // re-sorted based on similarity
     try {
       List<String> resultList = search(query, "", "", 30);
       resp = populateBingHit(resultList.get(0));
       // now we apply our own relevance filter
-      newResp = calculateMatchScoreResortHits(resp, query);
+      newResp.setHits(calculateMatchScoreResortHits(resp.getHits(), query));
 
     } catch (Exception e) {
       // e.printStackTrace();

Modified: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SpeechRecognitionResultsProcessor.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SpeechRecognitionResultsProcessor.java?rev=1387703&r1=1387702&r2=1387703&view=diff
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SpeechRecognitionResultsProcessor.java (original)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SpeechRecognitionResultsProcessor.java Wed Sep 19 18:46:31 2012
@@ -27,11 +27,12 @@ import opennlp.tools.textsimilarity.Pars
 import opennlp.tools.textsimilarity.SentencePairMatchResult;
 import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;
 
-public class SpeechRecognitionResultsProcessor extends BingWebQueryRunner {
+public class SpeechRecognitionResultsProcessor /*extends BingWebQueryRunner*/ {
   private static Logger LOG = Logger
       .getLogger("opennlp.tools.similarity.apps.SpeechRecognitionResultsProcessor");
   private ParseTreeChunkListScorer parseTreeChunkListScorer = new ParseTreeChunkListScorer();
   ParserChunker2MatcherProcessor sm;
+  WebSearchEngineResultsScraper scraper = new WebSearchEngineResultsScraper();
 
   /**
    * Gets an expression and tries to find it on the web. If search results are
@@ -40,19 +41,19 @@ public class SpeechRecognitionResultsPro
    * results ate not similar to this phrase, we conclude that the phrase is
    * meaningless (does not make sense, nobody has ever said something like that)
    * 
-   * @param resp
-   *          BingResponse, search results for a phrase being assesses with
+   * @param  hits
+   *          list of search results for a phrase being assesses with
    *          respect to meaningfulness
    * @param searchQuery
    *          the phrase we are assessing
    * @return total similarity score for all search results
    */
-  private double calculateTotalMatchScoreForHits(BingResponse resp,
+  private double calculateTotalMatchScoreForHits(List<HitBase> hits,
       String searchQuery) {
 
     sm = ParserChunker2MatcherProcessor.getInstance();
     double totalMatchScore = 0;
-    for (HitBase hit : resp.getHits()) {
+    for (HitBase hit : hits) {
       String snapshot = hit.getAbstractText().replace("<b>...</b>", ". ")
           .replace("<b>", "").replace("</b>", "");
       snapshot = snapshot.replace("</B>", "").replace("<B>", "")
@@ -103,9 +104,8 @@ public class SpeechRecognitionResultsPro
       BingResponse resp = null, // obtained from bing
       newResp = null; // re-sorted based on similarity
       try {
-        List<String> resultList = search(sentence, "", "", 10);
-        resp = populateBingHit(resultList.get(0));
-        double scoreForSentence = calculateTotalMatchScoreForHits(resp,
+        List<HitBase> resultList = scraper.runSearch(sentence);
+        double scoreForSentence = calculateTotalMatchScoreForHits(resultList,
             sentence);
         System.out.println("Total meaningfulness score = " + scoreForSentence
             + " for sentence = " + sentence);

Copied: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/WebSearchEngineResultsScraper.java (from r1362702, opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingSearchResultsScraper.java)
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/WebSearchEngineResultsScraper.java?p2=opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/WebSearchEngineResultsScraper.java&p1=opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingSearchResultsScraper.java&r1=1362702&r2=1387703&rev=1387703&view=diff
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingSearchResultsScraper.java (original)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/WebSearchEngineResultsScraper.java Wed Sep 19 18:46:31 2012
@@ -28,9 +28,9 @@ import java.util.Set;
 
 import org.apache.commons.lang.StringUtils;
 
-public class BingSearchResultsScraper {
+public class WebSearchEngineResultsScraper {
 
-  protected static String fetchPageBing(String url) {
+  protected static String fetchPageSearchEngine(String url) {
     System.out.println("fetch url " + url);
     String pageContent = null;
     StringBuffer buf = new StringBuffer();
@@ -62,7 +62,7 @@ public class BingSearchResultsScraper {
     return buf.toString();
   }
 
-  private static List<String> extractURLesFromPage(String content, String domain) {
+  private static List<String> extractURLsFromPage(String content, String domain) {
     List<String> results = new ArrayList<String>();
     if (content == null)
       return results;
@@ -87,29 +87,77 @@ public class BingSearchResultsScraper {
     return results;
   }
 
-  private static String formRequestURL(String seedURL) {
-    String requestUrl = "http://www.bing.com/search?q=site:" + seedURL;
+  private static List<HitBase> extractSearchResultFromPage(String content) {
+    List<HitBase> results = new ArrayList<HitBase>();
+    if (content == null)
+      return results;
+    content = StringUtils.substringBetween(content, "<div id=\"results",
+        "class=\"pagination");
+    if (content == null)
+      return results;
+    String[] srchResArea = content.split("</p>");
+    if (srchResArea == null)
+      return results;
+    for (String u : srchResArea) {
+      try {
+        u = u.substring(5);
+        HitBase hit = new HitBase();
+        String url = StringUtils.substringBetween(u, "class=\"url", "</span>");
+        if (url!=null)
+            url = url.substring(2);
+        String title = StringUtils.substringBetween(u, "\">", "</a><br />");
+        title = title.substring(title.indexOf("\">")+2);
+        String abstr = StringUtils.substringBetween(u, "\"body\">", "</span><br /");
+        hit.setUrl(url);
+        hit.setAbstractText(abstr);
+        hit.setTitle(title);
+        results.add(hit);
+      } catch (Exception e) {
+        //problem parsing SERP page; source - specific problem so we swallow exceptions here
+      }
+    }
+
+    return results;
+  }
+  
+  private static String formRequestURL(String query) {
+    String requestUrl = "http://www.hakia.com/search/web?q=" + query.replace(' ','+');
 
     return requestUrl;
   }
 
   public List<String> getURLsForWebDomain(String domain) {
-    return extractURLesFromPage(fetchPageBing(formRequestURL(domain)), domain);
+    return extractURLsFromPage(fetchPageSearchEngine(formRequestURL(domain)), domain);
   }
 
   public Set<String> getURLsForWebDomainIterations(String domain) {
     List<String> results = new ArrayList<String>();
-    List<String> res = extractURLesFromPage(
-        fetchPageBing(formRequestURL(domain)), domain);
+    List<String> res = extractURLsFromPage(
+        fetchPageSearchEngine(formRequestURL(domain)), domain);
     for (String r : res)
-      results.addAll(extractURLesFromPage(fetchPageBing(formRequestURL(r)), r));
+      results.addAll(extractURLsFromPage(fetchPageSearchEngine(formRequestURL(r)), r));
 
     return new HashSet<String>(results);
   }
+  
+  public List<HitBase> runSearch(String query) {
+    List<HitBase> hits = new ArrayList<HitBase>();
+    try {
+      String serp = fetchPageSearchEngine(formRequestURL(query));
+      hits = extractSearchResultFromPage(serp);
+
+    } catch (Exception e) {
+     
+      return hits;
+    }
+ 
+    hits = HitBase.removeDuplicates(hits);
+    return hits;
+  }
 
   public static void main(String[] args) {
-    System.out.println(new BingSearchResultsScraper()
-        .getURLsForWebDomainIterations("www.sfgate.com/entertainment/"));
+    WebSearchEngineResultsScraper scraper = new WebSearchEngineResultsScraper();
+    System.out.println(scraper.runSearch("lady gaga in san francisco"));        
   }
 
 }

Modified: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/Pair.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/Pair.java?rev=1387703&r1=1387702&r2=1387703&view=diff
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/Pair.java (original)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/Pair.java Wed Sep 19 18:46:31 2012
@@ -17,6 +17,10 @@
 
 package opennlp.tools.similarity.apps.utils;
 
+import java.util.Comparator;
+
+import opennlp.tools.similarity.apps.HitBase;
+
 /**
  * Generic pair class for holding two objects. Often used as return object.
  * 
@@ -54,4 +58,19 @@ public class Pair<T1, T2> {
   public void setSecond(T2 second) {
     this.second = second;
   }
+  
+  public class PairComparable implements Comparator<Pair<T1, T2>> {
+    // @Override
+    public int compare(Pair o1, Pair o2) {
+      int b = -2;
+      if ( o1.second instanceof Float && o2.second instanceof Float){
+        
+        b =  (((Float)o1.second > (Float)o2.second) ? -1
+          : (((Float)o1.second == (Float)o2.second) ? 0 : 1));
+      }
+      return b;
+    }
+  }
+  
 }
+

Modified: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserCacheSerializer.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserCacheSerializer.java?rev=1387703&r1=1387702&r2=1387703&view=diff
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserCacheSerializer.java (original)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserCacheSerializer.java Wed Sep 19 18:46:31 2012
@@ -120,7 +120,8 @@ public class ParserCacheSerializer {
             + parseCacheFileNameCSV), ',');
         lines = reader.readAll();
       } catch (FileNotFoundException e) {
-        e.printStackTrace();
+        //e.printStackTrace();
+        System.err.println("Cannot find cache file");
         return null;
       } catch (IOException ioe) {
         ioe.printStackTrace();

Modified: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessor.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessor.java?rev=1387703&r1=1387702&r2=1387703&view=diff
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessor.java (original)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessor.java Wed Sep 19 18:46:31 2012
@@ -117,8 +117,9 @@ public class ParserChunker2MatcherProces
       initializePosTagger();
       initializeParser();
       initializeChunker();
-    } catch (Exception e) {
-      LOG.fine("model cant be read and we rely on cache");
+    } catch (Exception e) { // a typical error when 'model' is not installed
+      System.err.println("Please install OpenNLP model files in 'src/test/resources' (folder 'model'");
+      LOG.fine("The model can't be read and we rely on cache");
     }
   }