You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by ma...@apache.org on 2014/08/13 14:28:23 UTC

svn commit: r1617712 - in /opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker: ./ indexing/ scoring/

Author: markg
Date: Wed Aug 13 12:28:23 2014
New Revision: 1617712

URL: http://svn.apache.org/r1617712
Log:
OPENNLP-706
Addressed issues from Joern's code review. Also made use of the hierarchy field configurable, and added index-time boosting for administrative boundary types and populated place types so that these hits are weighted more heavily in the index.

Removed:
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeoEntityLinkerSetupUtils.java
Modified:
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContextGenerator.java
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesProcessor.java
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/RegionProcessor.java
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/FuzzyStringMatchScorer.java

Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContextGenerator.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContextGenerator.java?rev=1617712&r1=1617711&r2=1617712&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContextGenerator.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContextGenerator.java Wed Aug 13 12:28:23 2014
@@ -140,9 +140,7 @@ public class AdminBoundaryContextGenerat
    */
   private AdminBoundaryContext process(String text) {
     try {
-      if (text.contains("Convoy of terror")) {
-        System.out.println("");
-      }
+    
       reset();
       Map<String, Set<Integer>> countryhitMap = regexfind(text, countryMap, countryHitSet);
       if (!countryhitMap.isEmpty()) {
@@ -282,7 +280,7 @@ public class AdminBoundaryContextGenerat
         if (name == null) {
           continue;
         }
-        name = "[^\\p{L}\\p{Nd}]" + name.replace(", the", "") + "[^\\p{L}\\p{Nd}]";
+        name = "(^|[^\\p{L}\\p{Nd}])" + name.replace(", the", "") + "([^\\p{L}\\p{Nd}]|$)";
         Pattern regex = Pattern.compile(name, Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
         Matcher rs = regex.matcher(docText);
         String code = entry.toLowerCase();

Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java?rev=1617712&r1=1617711&r2=1617712&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java Wed Aug 13 12:28:23 2014
@@ -38,7 +38,6 @@ import org.apache.lucene.util.Version;
 import opennlp.tools.entitylinker.EntityLinkerProperties;
 import org.apache.log4j.Logger;
 import org.apache.lucene.analysis.util.CharArraySet;
-import org.apache.lucene.search.Sort;
 
 /**
  *
@@ -49,10 +48,13 @@ import org.apache.lucene.search.Sort;
  */
 public class GazetteerSearcher {
 
+  //private static final String boostedTerms = " AND loctype(ADM1^1 ADM1H^1 ADM2^1 ADM2H^1 ADM3^1 ADM3H^1 ADM4^1 ADM4H^1 ADM5^1 ADMD^1 ADMDH^1 PCLD^1 PCLH^1 PCLI^1 PCLIX^1 TERR^1 PCLIX^1 PPL^1 PPLA^1 PPLA2^1 PPLA3^1 PPLA4^1 PPLC^1 PPLCH^1 PPLF^1 PPLG^1 PPLH^1 PPLL^1 PPLQ^1 PPLR^1 PPLS^1 PPLX^1 STLMT^1) ";
+
   private final String REGEX_CLEAN = "[^\\p{L}\\p{Nd}]";
   private static final Logger LOGGER = Logger.getLogger(GazetteerSearcher.class);
   private double scoreCutoff = .70;
-  private boolean doubleQuoteAllSearchTerms = false;
+  private boolean doubleQuoteAllSearchTerms = true;
+  private boolean useHierarchyField = false;
   private Directory geonamesIndex;//= new MMapDirectory(new File(indexloc));
   private IndexReader geonamesReader;// = DirectoryReader.open(geonamesIndex);
   private IndexSearcher geonamesSearcher;// = new IndexSearcher(geonamesReader);
@@ -85,13 +87,16 @@ public class GazetteerSearcher {
     this.properties = properties;
     init();
   }
-/**
- * Searches the single lucene index that includes the location hierarchy.
- * @param searchString the location name to search for
- * @param rowsReturned how many index entries to return (top N...)
- * @param whereClause the conditional statement that defines the index type and the country oode.
- * @return 
- */
+
+  /**
+   * Searches the single lucene index that includes the location hierarchy.
+   *
+   * @param searchString the location name to search for
+   * @param rowsReturned how many index entries to return (top N...)
+   * @param whereClause the conditional statement that defines the index type
+   * and the country oode.
+   * @return
+   */
   public ArrayList<GazetteerEntry> find(String searchString, int rowsReturned, String whereClause) {
     ArrayList<GazetteerEntry> linkedData = new ArrayList<>();
     searchString = cleanInput(searchString);
@@ -104,12 +109,11 @@ public class GazetteerSearcher {
        * case the code variables will be empty strings
        */
       String placeNameQueryString = "placename:(" + searchString.toLowerCase() + ") AND " + whereClause;
-      if (searchString.trim().contains(" ")) {
+      if (searchString.trim().contains(" ") && useHierarchyField) {
         placeNameQueryString = "(placename:(" + searchString.toLowerCase() + ") AND hierarchy:(" + formatForHierarchy(searchString) + "))"
                 + " AND " + whereClause;
       }
-
-      //  luceneQueryString = "hierarchy:(tampa florida) AND gazsource:usgs";
+       
       /**
        * check the cache and go no further if the records already exist
        */
@@ -123,14 +127,13 @@ public class GazetteerSearcher {
        */
       QueryParser parser = new QueryParser(Version.LUCENE_48, placeNameQueryString, opennlpAnalyzer);
       Query q = parser.parse(placeNameQueryString);
-      
-      TopDocs bestDocs = opennlpSearcher.search(q, rowsReturned, Sort.RELEVANCE);
-  
+
+      TopDocs bestDocs = opennlpSearcher.search(q, rowsReturned);
+
       for (int i = 0; i < bestDocs.scoreDocs.length; ++i) {
         GazetteerEntry entry = new GazetteerEntry();
         int docId = bestDocs.scoreDocs[i].doc;
         double sc = bestDocs.scoreDocs[i].score;
-
         entry.getScoreMap().put("lucene", sc);
         entry.setIndexID(docId + "");
 
@@ -165,23 +168,23 @@ public class GazetteerSearcher {
          */
         int maxLen = searchString.length() > entry.getItemName().length() ? searchString.length() : entry.getItemName().length();
 
-        Double normLev = Math.abs(1-(sc / (double) maxLen));//searchString.length() / (double) entry.getItemName().length();
+        Double normLev = Math.abs(1 - (sc / (double) maxLen));//searchString.length() / (double) entry.getItemName().length();
         /**
          * only want hits above the levenstein thresh. This should be a low
          * thresh due to the use of the hierarchy field in the index
          */
-        if (normLev.compareTo(scoreCutoff) >= 0) {
-//          if (entry.getItemParentID().toLowerCase().equals(parentid.toLowerCase()) || parentid.toLowerCase().equals("")) {
-          entry.getScoreMap().put("normlucene", normLev);
-          //make sure we don't produce a duplicate
-          if (!linkedData.contains(entry)) {
-            linkedData.add(entry);
-            /**
-             * add the records to the cache for this query
-             */
-            GazetteerSearchCache.put(placeNameQueryString, linkedData);
+        if (normLev > scoreCutoff) {
+          if (entry.getItemParentID().toLowerCase().equals(parentid.toLowerCase()) || parentid.toLowerCase().equals("")) {
+            entry.getScoreMap().put("normlucene", normLev);
+            //make sure we don't produce a duplicate
+            if (!linkedData.contains(entry)) {
+              linkedData.add(entry);
+              /**
+               * add the records to the cache for this query
+               */
+              GazetteerSearchCache.put(placeNameQueryString, linkedData);
+            }
           }
-//          }
         }
       }
 
@@ -311,7 +314,7 @@ public class GazetteerSearcher {
    *
    * @return
    */
-    @Deprecated
+  @Deprecated
   public ArrayList<GazetteerEntry> usgsFind(String searchString, int rowsReturned) {
     ArrayList<GazetteerEntry> linkedData = new ArrayList<>();
     searchString = cleanInput(searchString);
@@ -406,6 +409,7 @@ public class GazetteerSearcher {
    */
   private String cleanInput(String input) {
     String output = input.replaceAll(REGEX_CLEAN, " ").trim();
+    output = output.replace("  ", " ");
     if (doubleQuoteAllSearchTerms) {
       return "\"" + output + "\"";
     } else {
@@ -415,56 +419,35 @@ public class GazetteerSearcher {
   }
 
   private void init() throws Exception {
-//    if (usgsIndex == null) {
-//      String indexloc = properties.getProperty("opennlp.geoentitylinker.gaz.usgs", "");
-//      if (indexloc.equals("")) {
-//        // System.out.println("USGS Gaz location not found");
-//        LOGGER.error(new Exception("USGS Gaz location not found"));
-//      }
-//      String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", String.valueOf(scoreCutoff));
-//
-//      scoreCutoff = Double.valueOf(cutoff);
-//      String doubleQuote = properties.getProperty("opennlp.geoentitylinker.gaz.doublequote", String.valueOf(doubleQuoteAllSearchTerms));
-//      doubleQuoteAllSearchTerms = Boolean.valueOf(doubleQuote);
-//      usgsIndex = new MMapDirectory(new File(indexloc));
-//      usgsReader = DirectoryReader.open(usgsIndex);
-//      usgsSearcher = new IndexSearcher(usgsReader);
-//      usgsAnalyzer = new StandardAnalyzer(Version.LUCENE_48, new CharArraySet(Version.LUCENE_48, new ArrayList(), true));
-//    }
-//    if (geonamesIndex == null) {
-//      String indexloc = properties.getProperty("opennlp.geoentitylinker.gaz.geonames", "");
-//      if (indexloc.equals("")) {
-//        LOGGER.error(new Exception("Geonames Gaz location not found"));
-//
-//      }
-//      String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", String.valueOf(scoreCutoff));
-//      scoreCutoff = Double.valueOf(cutoff);
-//      geonamesIndex = new MMapDirectory(new File(indexloc));
-//      geonamesReader = DirectoryReader.open(geonamesIndex);
-//      geonamesSearcher = new IndexSearcher(geonamesReader);
-//      //TODO: a language code switch statement should be employed here at some point
-//      geonamesAnalyzer = new StandardAnalyzer(Version.LUCENE_48, new CharArraySet(Version.LUCENE_48, new ArrayList(), true));
-//
-//    }
+
     if (opennlpIndex == null) {
       String indexloc = properties.getProperty("opennlp.geoentitylinker.gaz", "");
       if (indexloc.equals("")) {
         LOGGER.error(new Exception("Opennlp combined Gaz directory location not found"));
 
       }
-      //  String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", String.valueOf(scoreCutoff));
-      //  scoreCutoff = Double.valueOf(cutoff);
+
       opennlpIndex = new MMapDirectory(new File(indexloc));
       opennlpReader = DirectoryReader.open(opennlpIndex);
       opennlpSearcher = new IndexSearcher(opennlpReader);
       //TODO: a language code switch statement should be employed here at some point
       opennlpAnalyzer = new StandardAnalyzer(Version.LUCENE_48, new CharArraySet(Version.LUCENE_48, new ArrayList(), true));
+      String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", String.valueOf(scoreCutoff));
+      String usehierarchy = properties.getProperty("opennlp.geoentitylinker.gaz.hierarchyfield", String.valueOf("0"));
+      if (cutoff != null && !cutoff.isEmpty()) {
+        scoreCutoff = Double.valueOf(cutoff);
+      }
+      if (usehierarchy != null && !usehierarchy.isEmpty()) {
+        useHierarchyField = Boolean.valueOf(usehierarchy);
+      }
+      //  opennlp.geoentitylinker.gaz.doublequote=false
+      //opennlp.geoentitylinker.gaz.hierarchyfield=false
 
     }
   }
 
   private String formatForHierarchy(String searchTerm) {
-    String[] parts = searchTerm.split(" ");
+    String[] parts = cleanInput(searchTerm).split(" ");
     String out = "";
     if (parts.length != 0) {
       for (String string : parts) {

Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java?rev=1617712&r1=1617711&r2=1617712&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java Wed Aug 13 12:28:23 2014
@@ -15,6 +15,7 @@
  */
 package opennlp.addons.geoentitylinker;
 
+import java.io.IOException;
 import opennlp.addons.geoentitylinker.scoring.ModelBasedScorer;
 import opennlp.addons.geoentitylinker.scoring.LinkedEntityScorer;
 import opennlp.addons.geoentitylinker.scoring.CountryProximityScorer;
@@ -65,10 +66,12 @@ public class GeoEntityLinker implements 
           for (String whereclause : context.getWhereClauses()) {
             geoNamesEntries.addAll(gazateerSearcher.find(matches[i], 3, whereclause));
           }
-        }else{//this means there were no where clauses generated so the where clause will default to look at the entire index
-          geoNamesEntries.addAll(gazateerSearcher.find(matches[i], 3, " gaztype:* "));
+        } else {//this means there were no where clauses generated so the where clause will default to look at the entire index
+          geoNamesEntries.addAll(gazateerSearcher.find(matches[i], 3, " gaztype:usgs geonames regions "));
+        }
+        if (geoNamesEntries.isEmpty()) {
+          continue;
         }
-        //start generating queries
         LinkedSpan newspan = new LinkedSpan(geoNamesEntries, names[i], 0);
         newspan.setSearchTerm(matches[i]);
         newspan.setLinkedEntries(geoNamesEntries);
@@ -93,19 +96,19 @@ public class GeoEntityLinker implements 
       scorers.add(new CountryProximityScorer());
       scorers.add(new ModelBasedScorer());
       scorers.add(new FuzzyStringMatchScorer());
-     // scorers.add(new ProvinceProximityScorer());
+      // scorers.add(new ProvinceProximityScorer());
     }
   }
 
   @Override
-  public void init(EntityLinkerProperties properties) {
+  public void init(EntityLinkerProperties properties) throws IOException {
     try {
       this.linkerProperties = properties;
       countryContext = new AdminBoundaryContextGenerator(this.linkerProperties);
       gazateerSearcher = new GazetteerSearcher(this.linkerProperties);
       loadScorers();
     } catch (Exception ex) {
-      throw new RuntimeException(ex);
+      throw new IOException(ex);
     }
   }
 

Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesProcessor.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesProcessor.java?rev=1617712&r1=1617711&r2=1617712&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesProcessor.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesProcessor.java Wed Aug 13 12:28:23 2014
@@ -20,7 +20,6 @@ import java.io.File;
 import java.io.FileReader;
 import java.io.FileWriter;
 import java.io.IOException;
-import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.HashMap;
 import java.util.HashSet;
@@ -164,25 +163,30 @@ public class GeonamesProcessor {
         // System.out.println(line);
 
       }
-      writer.close();
+
     } catch (IOException ex) {
       ex.printStackTrace();
     }
     System.out.println("successfully wrote Geonames entries to country oontext file");
   }
 
- /**
-  * 
-  * @param gazateerInputData the Geonames allCounties.txt file
-  * @param type the types of gaz entry, usgs, geonames, or regions
-  * @param adms the province info
-  * @param countrycodes the country code info
-  * @param w the lucene index writer
-  * @throws Exception 
-  */
+  /**
+   *
+   * @param gazateerInputData the Geonames allCounties.txt file
+   * @param type the types of gaz entry, usgs, geonames, or regions
+   * @param adms the province info
+   * @param countrycodes the country code info
+   * @param w the lucene index writer
+   * @throws Exception
+   */
   public static void readFile(File gazateerInputData, GazetteerIndexer.GazType type, Map<String, AdminBoundary> adms, Map<String, String> countrycodes, IndexWriter w) throws Exception {
 
     BufferedReader reader = new BufferedReader(new FileReader(gazateerInputData));
+    String[] boosts = "ADM1 ADM1H ADM2 ADM2H ADM3 ADM3H ADM4 ADM4H ADM5 ADMD ADMDH PCLD PCLH PCLI PCLIX TERR PCLIX PPL PPLA PPLA2 PPLA3 PPLA4 PPLC PPLCH PPLF PPLG PPLH PPLL PPLQ PPLR PPLS PPLX STLMT".split(" ");
+    Map<String, Float> boostMap = new HashMap<>();
+    for (String boost : boosts) {
+      boostMap.put(boost.toLowerCase(), 10f);
+    }
     String[] fieldStrings = new String[]{
       "geonameid",
       "name",
@@ -225,7 +229,7 @@ public class GeonamesProcessor {
       String placeName = values[2];
       String lat = values[4];
       String lon = values[5];
-      String dsg = values[7];
+      String dsg = values[7].toLowerCase();
       String id = values[0];
       String concatIndexEntry = "";
       if (adm != null) {
@@ -255,13 +259,20 @@ public class GeonamesProcessor {
       doc.add(new TextField("placename", placeName, Field.Store.YES));
       doc.add(new TextField("latitude", lat, Field.Store.YES));
       doc.add(new TextField("longitude", lon, Field.Store.YES));
-      doc.add(new TextField("loctype", dsg, Field.Store.YES));
+      if (boostMap.containsKey(dsg)) {
+        TextField f = new TextField("loctype", dsg, Field.Store.YES);
+        f.setBoost(boostMap.get(dsg));
+        doc.add(f);
+      } else {
+        doc.add(new TextField("loctype", dsg, Field.Store.YES));
+      }
       doc.add(new TextField("admincode", (ccode + "." + admincode).toLowerCase(), Field.Store.YES));
       doc.add(new TextField("countrycode", ccode.toLowerCase(), Field.Store.YES));
       doc.add(new TextField("countycode", "", Field.Store.YES));
 
       doc.add(new TextField("locid", id, Field.Store.YES));
       doc.add(new TextField("gazsource", "geonames", Field.Store.YES));
+
       w.addDocument(doc);
 
       counter++;
@@ -272,7 +283,7 @@ public class GeonamesProcessor {
 
     }
 
-    System.out.println("Completed indexing gaz! index name is: " + type.toString());
+    System.out.println("Completed indexing geonames gaz! index name is: " + type.toString());
   }
 
 }

Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/RegionProcessor.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/RegionProcessor.java?rev=1617712&r1=1617711&r2=1617712&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/RegionProcessor.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/RegionProcessor.java Wed Aug 13 12:28:23 2014
@@ -21,10 +21,6 @@ import java.io.FileReader;
 import java.io.FileWriter;
 import java.util.ArrayList;
 import java.util.List;
-import java.util.Map;
-import java.util.logging.Level;
-import java.util.logging.Logger;
-import opennlp.addons.geoentitylinker.AdminBoundary;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.TextField;

Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java?rev=1617712&r1=1617711&r2=1617712&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java Wed Aug 13 12:28:23 2014
@@ -179,7 +179,6 @@ public class USGSProcessor {
       ///  System.out.println(line);
 
       }
-      writer.close();
     } catch (IOException ex) {
       Logger.getLogger(GeonamesProcessor.class.getName()).log(Level.SEVERE, null, ex);
     }

Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/FuzzyStringMatchScorer.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/FuzzyStringMatchScorer.java?rev=1617712&r1=1617711&r2=1617712&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/FuzzyStringMatchScorer.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/FuzzyStringMatchScorer.java Wed Aug 13 12:28:23 2014
@@ -61,7 +61,7 @@ public class FuzzyStringMatchScorer impl
    * @return
    */
   public double getDiceCoefficient(String s1, String s2, int nGrams) {
-    if (s1.equals("") || s1.equals("")) {
+    if (s1.isEmpty() || s2.isEmpty()) {
       return 0d;
     }
     List<String> s1Grams = new ArrayList<>();