You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by do...@apache.org on 2010/10/01 20:29:43 UTC

svn commit: r1003608 - in /nutch/trunk: ./ conf/ src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/ src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/

Author: dogacan
Date: Fri Oct  1 18:29:42 2010
New Revision: 1003608

URL: http://svn.apache.org/viewvc?rev=1003608&view=rev
Log:
NUTCH-894 - Move statistical language identification from indexing to parsing step. Patch contributed by Sertan Alkan.

Removed:
    nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java
    nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/NGramProfile.java
    nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/da.ngp
    nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/de.ngp
    nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/ee.ngp
    nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/el.ngp
    nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/en.ngp
    nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/es.ngp
    nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/fi.ngp
    nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/fr.ngp
    nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/hu.ngp
    nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/is.ngp
    nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/it.ngp
    nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/nl.ngp
    nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/no.ngp
    nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/pl.ngp
    nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/pt.ngp
    nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/ru.ngp
    nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/sv.ngp
    nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/th.ngp
    nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestLanguageIdentifier.java
    nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestNGramProfile.java
Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/conf/nutch-default.xml
    nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
    nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java
    nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1003608&r1=1003607&r2=1003608&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Fri Oct  1 18:29:42 2010
@@ -2,6 +2,9 @@ Nutch Change Log
 
 Release 2.0 - Current Development
 
+* NUTCH-894 Move statistical language identification from indexing to parsing step
+  (Sertan Alkan via dogacan)
+
 * NUTCH-901 Make index-more plug-in configurable (Markus Jelsma via mattmann)
 
 * NUTCH-862 HttpClient null pointer exception (Sebastian Nagel via ab)

Modified: nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1003608&r1=1003607&r2=1003608&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Fri Oct  1 18:29:42 2010
@@ -993,6 +993,23 @@
   </description>
 </property>
 
+<property>
+  <name>lang.extraction.policy</name>
+  <value>detect,identify</value>
+  <description>This determines when the plugin uses detection and
+  statistical identification mechanisms. The order in which
+  detect and identify are written determines the extraction
+  policy. The default (detect,identify) means the plugin will
+  first try to extract language info from page headers and metadata;
+  if this is not successful, it will try using Tika language
+  identification. Possible values are:
+    detect
+    identify
+    detect,identify
+    identify,detect
+  </description>
+</property>
+
 <!-- Temporary Hadoop 0.17.x workaround. -->
 
 <property>

Modified: nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java?rev=1003608&r1=1003607&r2=1003608&view=diff
==============================================================================
--- nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java (original)
+++ nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java Fri Oct  1 18:29:42 2010
@@ -30,6 +30,7 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.net.protocols.Response;
 import org.apache.nutch.parse.HTMLMetaTags;
 import org.apache.nutch.parse.ParseFilter;
 import org.apache.nutch.parse.Parse;
@@ -37,6 +38,7 @@ import org.apache.nutch.storage.WebPage;
 import org.apache.nutch.storage.WebPage.Field;
 import org.apache.nutch.util.Bytes;
 import org.apache.nutch.util.NodeWalker;
+import org.apache.tika.language.LanguageIdentifier;
 import org.w3c.dom.DocumentFragment;
 import org.w3c.dom.Element;
 import org.w3c.dom.NamedNodeMap;
@@ -52,14 +54,16 @@ public class HTMLLanguageParser implemen
 
   private static final Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();
 
+  private int detect = -1, identify = -1;
+
   /* A static Map of ISO-639 language codes */
-  private static Map LANGUAGES_MAP = new HashMap();
+  private static Map<String, String> LANGUAGES_MAP = new HashMap<String, String>();
   static {
     try {
       Properties p = new Properties();
       p.load(HTMLLanguageParser.class
           .getResourceAsStream("langmappings.properties"));
-      Enumeration keys = p.keys();
+      Enumeration<?> keys = p.keys();
       while (keys.hasMoreElements()) {
         String key = (String) keys.nextElement();
         String[] values = p.getProperty(key).split(",", -1);
@@ -85,26 +89,82 @@ public class HTMLLanguageParser implemen
    * (http://dublincore.org/documents/2000/07/16/usageguide/qualified
    * -html.shtml#language) <li>3. meta http-equiv (content-language)
    * (http://www.w3.org/TR/REC-html40/struct/global.html#h-7.4.4.2) <br>
-   * Only the first occurence of language is stored.
    */
   public Parse filter(String url, WebPage page, Parse parse,
       HTMLMetaTags metaTags, DocumentFragment doc) {
     String lang = null;
+
+    if (detect >= 0 && identify < 0) {
+      lang = detectLanguage(page, doc);
+    } else if (detect < 0 && identify >= 0) {
+      lang = identifyLanguage(parse);
+    } else if (detect < identify) {
+      lang = detectLanguage(page, doc);
+      if (lang == null) {
+        lang = identifyLanguage(parse);
+      }
+    } else if (identify < detect) {
+      lang = identifyLanguage(parse);
+      if (lang == null) {
+        lang = detectLanguage(page, doc);
+      }
+    } else {
+      LOG.warn("No configuration for language extraction policy is provided");
+      return parse;
+    }
+
+    if (lang != null) {
+      page.putToMetadata(new Utf8(Metadata.LANGUAGE), ByteBuffer.wrap(lang
+          .getBytes()));
+      return parse;
+    }
+
+    return parse;
+  }
+
+  /** Try to find the document's language from page headers and metadata */
+  private String detectLanguage(WebPage page, DocumentFragment doc) {
+    String lang = null;
     ByteBuffer blang = getLanguageFromMetadata(page.getMetadata());
     if (blang == null) {
-      // Trying to find the document's language
       LanguageParser parser = new LanguageParser(doc);
       lang = parser.getLanguage();
     } else
       lang = Bytes.toString(blang.array());
 
     if (lang != null) {
-      // parse..getParseMeta().set(Metadata.LANGUAGE, lang);
-      // TODO where to we store it? in parse or doc?
-      page.putToMetadata(new Utf8(Metadata.LANGUAGE), ByteBuffer.wrap(lang
-          .getBytes()));
+      return lang;
     }
-    return parse;
+
+    Utf8 ulang = page.getFromHeaders(new Utf8(Response.CONTENT_LANGUAGE));
+    if (ulang != null) {
+      lang = ulang.toString();
+    }
+
+    return lang;
+  }
+
+  /** Use statistical language identification to extract page language */
+  private String identifyLanguage(Parse parse) {
+    StringBuilder text = new StringBuilder();
+    if (parse != null) {
+      String title = parse.getTitle();
+      if (title != null) {
+        text.append(title.toString());
+      }
+
+      String content = parse.getText();
+      if (content != null) {
+       text.append(" ").append(content.toString());
+      }
+
+      LanguageIdentifier identifier = new LanguageIdentifier(text.toString());
+
+      if (identifier.isReasonablyCertain()) {
+        return identifier.getLanguage();
+      }
+    }
+    return null;
   }
 
   // Check in the metadata whether the language has already been stored there by
@@ -157,8 +217,6 @@ public class HTMLLanguageParser implemen
         String nodeName = currentNode.getNodeName();
         short nodeType = currentNode.getNodeType();
 
-        String lang = null;
-
         if (nodeType == Node.ELEMENT_NODE) {
 
           // Check for the lang HTML attribute
@@ -244,6 +302,14 @@ public class HTMLLanguageParser implemen
 
   public void setConf(Configuration conf) {
     this.conf = conf;
+    String[] policy = conf.getStrings("lang.extraction.policy");
+    for (int i = 0; i < policy.length; i++) {
+      if (policy[i].equals("detect")) {
+        detect = i;
+      } else if (policy[i].equals("identify")) {
+        identify = i;
+      }
+    }
   }
 
   public Configuration getConf() {

Modified: nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java?rev=1003608&r1=1003607&r2=1003608&view=diff
==============================================================================
--- nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java (original)
+++ nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java Fri Oct  1 18:29:42 2010
@@ -27,31 +27,24 @@ import org.apache.nutch.indexer.Indexing
 import org.apache.nutch.indexer.IndexingFilter;
 import org.apache.nutch.indexer.NutchDocument;
 import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.net.protocols.Response;
 import org.apache.nutch.storage.WebPage;
 import org.apache.nutch.storage.WebPage.Field;
 import org.apache.nutch.util.Bytes;
 
 /**
- * An {@link org.apache.nutch.indexer.IndexingFilter} that add a
+ * An {@link org.apache.nutch.indexer.IndexingFilter} that adds a
  * <code>lang</code> (language) field to the document.
- * 
- * It tries to find the language of the document by:
- * <ul>
- * <li>First, checking if {@link HTMLLanguageParser} add some language
- * information</li>
- * <li>Then, checking if a <code>Content-Language</code> HTTP header can be
- * found</li>
- * <li>Finaly by analyzing the document content</li>
- * </ul>
- * 
+ *
+ * It tries to find the language of the document by checking
+ * whether {@link HTMLLanguageParser} has added some language
+ * information.
+ *
  * @author Sami Siren
  * @author Jerome Charron
  */
 public class LanguageIndexingFilter implements IndexingFilter {
 
   private Configuration conf;
-  private LanguageIdentifier languageIdentifier;
 
   private static final Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();
 
@@ -65,9 +58,7 @@ public class LanguageIndexingFilter impl
   /**
    * Constructs a new Language Indexing Filter.
    */
-  public LanguageIndexingFilter() {
-
-  }
+  public LanguageIndexingFilter() {}
 
   public NutchDocument filter(NutchDocument doc, String url, WebPage page)
       throws IndexingException {
@@ -79,21 +70,6 @@ public class LanguageIndexingFilter impl
       lang = Bytes.toString(blang.array());
     }
 
-    // check if HTTP-header tells us the language
-    if (lang == null) {
-      Utf8 ulang = page.getFromHeaders(new Utf8(Response.CONTENT_LANGUAGE));
-      if (ulang != null) {
-        lang = ulang.toString();
-      }
-    }
-
-    if (lang == null) {
-      StringBuilder text = new StringBuilder();
-      text.append(page.getTitle().toString()).append(" ").append(
-          page.getText().toString());
-      lang = this.languageIdentifier.identify(text);
-    }
-
     if (lang == null) {
       lang = "unknown";
     }
@@ -112,7 +88,6 @@ public class LanguageIndexingFilter impl
 
   public void setConf(Configuration conf) {
     this.conf = conf;
-    this.languageIdentifier = new LanguageIdentifier(conf);
   }
 
   public Configuration getConf() {

Modified: nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java?rev=1003608&r1=1003607&r2=1003608&view=diff
==============================================================================
--- nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java (original)
+++ nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java Fri Oct  1 18:29:42 2010
@@ -17,6 +17,8 @@
 package org.apache.nutch.analysis.lang;
 
 // JUnit imports
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
 import java.nio.ByteBuffer;
 
 import junit.framework.TestCase;
@@ -28,6 +30,7 @@ import org.apache.nutch.storage.WebPage;
 import org.apache.nutch.util.Bytes;
 import org.apache.nutch.util.EncodingDetector;
 import org.apache.nutch.util.NutchConfiguration;
+import org.apache.tika.language.LanguageIdentifier;
 
 public class TestHTMLLanguageParser extends TestCase {
 
@@ -95,6 +98,50 @@ public class TestHTMLLanguageParser exte
     }
   }
 
+  public void testLanguageIndentifier() {
+    try {
+      long total = 0;
+      LanguageIdentifier identifier;
+      BufferedReader in = new BufferedReader(new InputStreamReader(this
+          .getClass().getResourceAsStream("test-referencial.txt")));
+      String line = null;
+      while ((line = in.readLine()) != null) {
+        String[] tokens = line.split(";");
+        if (!tokens[0].equals("")) {
+          StringBuilder content = new StringBuilder();
+          // Test each line of the file...
+          BufferedReader testFile = new BufferedReader(new InputStreamReader(
+              this.getClass().getResourceAsStream(tokens[0]), "UTF-8"));
+          String testLine = null, lang = null;
+          while ((testLine = testFile.readLine()) != null) {
+            content.append(testLine + "\n");
+            testLine = testLine.trim();
+            if (testLine.length() > 256) {
+              identifier = new LanguageIdentifier(testLine);
+              lang = identifier.getLanguage();
+              assertEquals(tokens[1], lang);
+            }
+          }
+          testFile.close();
+
+          // Test the whole file
+          long start = System.currentTimeMillis();
+          System.out.println(content.toString());
+          identifier = new LanguageIdentifier(content.toString());
+          lang = identifier.getLanguage();
+          System.out.println(lang);
+          total += System.currentTimeMillis() - start;
+          assertEquals(tokens[1], lang);
+        }
+      }
+      in.close();
+      System.out.println("Total Time=" + total);
+    } catch (Exception e) {
+      e.printStackTrace();
+      fail(e.toString());
+    }
+  }
+
   private WebPage getPage(String text) {
     WebPage page = new WebPage();
     page.setBaseUrl(BASE);