You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by fe...@apache.org on 2012/02/20 10:39:43 UTC

svn commit: r1291165 - in /nutch/branches/nutchgora: CHANGES.txt conf/nutch-default.xml src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java

Author: ferdy
Date: Mon Feb 20 09:39:42 2012
New Revision: 1291165

URL: http://svn.apache.org/viewvc?rev=1291165&view=rev
Log:
NUTCH-1280 language-identifier should have option to use detected value by Tika even when uncertain

Modified:
    nutch/branches/nutchgora/CHANGES.txt
    nutch/branches/nutchgora/conf/nutch-default.xml
    nutch/branches/nutchgora/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java

Modified: nutch/branches/nutchgora/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/nutchgora/CHANGES.txt?rev=1291165&r1=1291164&r2=1291165&view=diff
==============================================================================
--- nutch/branches/nutchgora/CHANGES.txt (original)
+++ nutch/branches/nutchgora/CHANGES.txt Mon Feb 20 09:39:42 2012
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Release nutchgora - Current Development
 
+* NUTCH-1280 language-identifier should have option to use detected value by Tika even when uncertain (ferdy)
+
 * NUTCH-1246 Upgrade to Hadoop 1.0.0 (lewismc)
 
 * NUTCH-1279 Check if limit has been reached in GeneraterReducer must be the first check performance-wise. (ferdy)

Modified: nutch/branches/nutchgora/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/branches/nutchgora/conf/nutch-default.xml?rev=1291165&r1=1291164&r2=1291165&view=diff
==============================================================================
--- nutch/branches/nutchgora/conf/nutch-default.xml (original)
+++ nutch/branches/nutchgora/conf/nutch-default.xml Mon Feb 20 09:39:42 2012
@@ -1026,6 +1026,15 @@
   </description>
 </property>
 
+<property>
+  <name>lang.identification.only.certain</name>
+  <value>false</value>
+  <description>If set to true and lang.extraction.policy contains identify,
+  the language code detected by Tika is assigned to the document ONLY if
+  Tika is reasonably certain of it; if false, it is assigned regardless.
+  </description>
+</property>
+
 <!-- Temporary Hadoop 0.17.x workaround. -->
 
 <property>

Modified: nutch/branches/nutchgora/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchgora/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java?rev=1291165&r1=1291164&r2=1291165&view=diff
==============================================================================
--- nutch/branches/nutchgora/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java (original)
+++ nutch/branches/nutchgora/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java Mon Feb 20 09:39:42 2012
@@ -81,6 +81,8 @@ public class HTMLLanguageParser implemen
 
   private Configuration conf;
 
+  private boolean onlyCertain;
+
   /**
    * Scan the HTML document looking at possible indications of content language<br>
    * <li>1. html lang attribute
@@ -160,7 +162,11 @@ public class HTMLLanguageParser implemen
 
       LanguageIdentifier identifier = new LanguageIdentifier(text.toString());
 
-      if (identifier.isReasonablyCertain()) {
+      if (onlyCertain) {
+        if (identifier.isReasonablyCertain()) {
+          return identifier.getLanguage();
+        }
+      } else {
         return identifier.getLanguage();
       }
     }
@@ -302,6 +308,7 @@ public class HTMLLanguageParser implemen
 
   public void setConf(Configuration conf) {
     this.conf = conf;
+    onlyCertain = conf.getBoolean("lang.identification.only.certain", false);
     String[] policy = conf.getStrings("lang.extraction.policy");
     for (int i = 0; i < policy.length; i++) {
       if (policy[i].equals("detect")) {