You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by fe...@apache.org on 2012/02/20 10:39:43 UTC
svn commit: r1291165 - in /nutch/branches/nutchgora: CHANGES.txt
conf/nutch-default.xml
src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
Author: ferdy
Date: Mon Feb 20 09:39:42 2012
New Revision: 1291165
URL: http://svn.apache.org/viewvc?rev=1291165&view=rev
Log:
NUTCH-1280 language-identifier should have option to use detected value by Tika even when uncertain
Modified:
nutch/branches/nutchgora/CHANGES.txt
nutch/branches/nutchgora/conf/nutch-default.xml
nutch/branches/nutchgora/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
Modified: nutch/branches/nutchgora/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/nutchgora/CHANGES.txt?rev=1291165&r1=1291164&r2=1291165&view=diff
==============================================================================
--- nutch/branches/nutchgora/CHANGES.txt (original)
+++ nutch/branches/nutchgora/CHANGES.txt Mon Feb 20 09:39:42 2012
@@ -2,6 +2,8 @@ Nutch Change Log
Release nutchgora - Current Development
+* NUTCH-1280 language-identifier should have option to use detected value by Tika even when uncertain (ferdy)
+
* NUTCH-1246 Upgrade to Hadoop 1.0.0 (lewismc)
* NUTCH-1279 Check if limit has been reached in GeneratorReducer must be the first check performance-wise. (ferdy)
Modified: nutch/branches/nutchgora/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/branches/nutchgora/conf/nutch-default.xml?rev=1291165&r1=1291164&r2=1291165&view=diff
==============================================================================
--- nutch/branches/nutchgora/conf/nutch-default.xml (original)
+++ nutch/branches/nutchgora/conf/nutch-default.xml Mon Feb 20 09:39:42 2012
@@ -1026,6 +1026,15 @@
</description>
</property>
+<property>
+ <name>lang.identification.only.certain</name>
+ <value>false</value>
+ <description>If true and lang.extraction.policy contains identify, the
+ language code returned by Tika is assigned to the document ONLY if Tika
+ deems it reasonably certain; if false, it is assigned even when uncertain.
+ </description>
+</property>
+
<!-- Temporary Hadoop 0.17.x workaround. -->
<property>
Modified: nutch/branches/nutchgora/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchgora/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java?rev=1291165&r1=1291164&r2=1291165&view=diff
==============================================================================
--- nutch/branches/nutchgora/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java (original)
+++ nutch/branches/nutchgora/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java Mon Feb 20 09:39:42 2012
@@ -81,6 +81,8 @@ public class HTMLLanguageParser implemen
private Configuration conf;
+ private boolean onlyCertain;
+
/**
* Scan the HTML document looking at possible indications of content language<br>
* <li>1. html lang attribute
@@ -160,7 +162,11 @@ public class HTMLLanguageParser implemen
LanguageIdentifier identifier = new LanguageIdentifier(text.toString());
- if (identifier.isReasonablyCertain()) {
+ if (onlyCertain) {
+ if (identifier.isReasonablyCertain()) {
+ return identifier.getLanguage();
+ }
+ } else {
return identifier.getLanguage();
}
}
@@ -302,6 +308,7 @@ public class HTMLLanguageParser implemen
public void setConf(Configuration conf) {
this.conf = conf;
+ onlyCertain = conf.getBoolean("lang.identification.only.certain", false);
String[] policy = conf.getStrings("lang.extraction.policy");
for (int i = 0; i < policy.length; i++) {
if (policy[i].equals("detect")) {