You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by do...@apache.org on 2010/10/01 20:29:43 UTC
svn commit: r1003608 - in /nutch/trunk: ./ conf/
src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/
src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/
Author: dogacan
Date: Fri Oct 1 18:29:42 2010
New Revision: 1003608
URL: http://svn.apache.org/viewvc?rev=1003608&view=rev
Log:
NUTCH-894 - Move statistical language identification from indexing to parsing step. Patch contributed by Sertan Alkan.
Removed:
nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java
nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/NGramProfile.java
nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/da.ngp
nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/de.ngp
nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/ee.ngp
nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/el.ngp
nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/en.ngp
nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/es.ngp
nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/fi.ngp
nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/fr.ngp
nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/hu.ngp
nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/is.ngp
nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/it.ngp
nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/nl.ngp
nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/no.ngp
nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/pl.ngp
nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/pt.ngp
nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/ru.ngp
nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/sv.ngp
nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/th.ngp
nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestLanguageIdentifier.java
nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestNGramProfile.java
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/conf/nutch-default.xml
nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java
nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1003608&r1=1003607&r2=1003608&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Fri Oct 1 18:29:42 2010
@@ -2,6 +2,9 @@ Nutch Change Log
Release 2.0 - Current Development
+* NUTCH-894 Move statistical language identification from indexing to parsing step
+ (Sertan Alkan via dogacan)
+
* NUTCH-901 Make index-more plug-in configurable (Markus Jelsma via mattmann)
* NUTCH-862 HttpClient null pointer exception (Sebastian Nagel via ab)
Modified: nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1003608&r1=1003607&r2=1003608&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Fri Oct 1 18:29:42 2010
@@ -993,6 +993,23 @@
</description>
</property>
+<property>
+ <name>lang.extraction.policy</name>
+ <value>detect,identify</value>
+ <description>This determines when the plugin uses detection and
+ statistical identification mechanisms. The order in which
+ detect and identify are listed determines the extraction
+ policy. The default (detect,identify) means the plugin will
+ first try to extract language info from page headers and metadata;
+ if this is not successful, it will fall back to Tika language
+ identification. Possible values are:
+ detect
+ identify
+ detect,identify
+ identify,detect
+ </description>
+</property>
+
<!-- Temporary Hadoop 0.17.x workaround. -->
<property>
Modified: nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java?rev=1003608&r1=1003607&r2=1003608&view=diff
==============================================================================
--- nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java (original)
+++ nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java Fri Oct 1 18:29:42 2010
@@ -30,6 +30,7 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.net.protocols.Response;
import org.apache.nutch.parse.HTMLMetaTags;
import org.apache.nutch.parse.ParseFilter;
import org.apache.nutch.parse.Parse;
@@ -37,6 +38,7 @@ import org.apache.nutch.storage.WebPage;
import org.apache.nutch.storage.WebPage.Field;
import org.apache.nutch.util.Bytes;
import org.apache.nutch.util.NodeWalker;
+import org.apache.tika.language.LanguageIdentifier;
import org.w3c.dom.DocumentFragment;
import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
@@ -52,14 +54,16 @@ public class HTMLLanguageParser implemen
private static final Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();
+ private int detect = -1, identify = -1;
+
/* A static Map of ISO-639 language codes */
- private static Map LANGUAGES_MAP = new HashMap();
+ private static Map<String, String> LANGUAGES_MAP = new HashMap<String, String>();
static {
try {
Properties p = new Properties();
p.load(HTMLLanguageParser.class
.getResourceAsStream("langmappings.properties"));
- Enumeration keys = p.keys();
+ Enumeration<?> keys = p.keys();
while (keys.hasMoreElements()) {
String key = (String) keys.nextElement();
String[] values = p.getProperty(key).split(",", -1);
@@ -85,26 +89,82 @@ public class HTMLLanguageParser implemen
* (http://dublincore.org/documents/2000/07/16/usageguide/qualified
* -html.shtml#language) <li>3. meta http-equiv (content-language)
* (http://www.w3.org/TR/REC-html40/struct/global.html#h-7.4.4.2) <br>
- * Only the first occurence of language is stored.
*/
public Parse filter(String url, WebPage page, Parse parse,
HTMLMetaTags metaTags, DocumentFragment doc) {
String lang = null;
+
+ if (detect >= 0 && identify < 0) {
+ lang = detectLanguage(page, doc);
+ } else if (detect < 0 && identify >= 0) {
+ lang = identifyLanguage(parse);
+ } else if (detect < identify) {
+ lang = detectLanguage(page, doc);
+ if (lang == null) {
+ lang = identifyLanguage(parse);
+ }
+ } else if (identify < detect) {
+ lang = identifyLanguage(parse);
+ if (lang == null) {
+ lang = detectLanguage(page, doc);
+ }
+ } else {
+ LOG.warn("No configuration for language extraction policy is provided");
+ return parse;
+ }
+
+ if (lang != null) {
+ page.putToMetadata(new Utf8(Metadata.LANGUAGE), ByteBuffer.wrap(lang
+ .getBytes()));
+ return parse;
+ }
+
+ return parse;
+ }
+
+ /** Try to find the document's language from page headers and metadata */
+ private String detectLanguage(WebPage page, DocumentFragment doc) {
+ String lang = null;
ByteBuffer blang = getLanguageFromMetadata(page.getMetadata());
if (blang == null) {
- // Trying to find the document's language
LanguageParser parser = new LanguageParser(doc);
lang = parser.getLanguage();
} else
lang = Bytes.toString(blang.array());
if (lang != null) {
- // parse..getParseMeta().set(Metadata.LANGUAGE, lang);
- // TODO where to we store it? in parse or doc?
- page.putToMetadata(new Utf8(Metadata.LANGUAGE), ByteBuffer.wrap(lang
- .getBytes()));
+ return lang;
}
- return parse;
+
+ Utf8 ulang = page.getFromHeaders(new Utf8(Response.CONTENT_LANGUAGE));
+ if (ulang != null) {
+ lang = ulang.toString();
+ }
+
+ return lang;
+ }
+
+ /** Use statistical language identification to extract page language */
+ private String identifyLanguage(Parse parse) {
+ StringBuilder text = new StringBuilder();
+ if (parse != null) {
+ String title = parse.getTitle();
+ if (title != null) {
+ text.append(title.toString());
+ }
+
+ String content = parse.getText();
+ if (content != null) {
+ text.append(" ").append(content.toString());
+ }
+
+ LanguageIdentifier identifier = new LanguageIdentifier(text.toString());
+
+ if (identifier.isReasonablyCertain()) {
+ return identifier.getLanguage();
+ }
+ }
+ return null;
}
// Check in the metadata whether the language has already been stored there by
@@ -157,8 +217,6 @@ public class HTMLLanguageParser implemen
String nodeName = currentNode.getNodeName();
short nodeType = currentNode.getNodeType();
- String lang = null;
-
if (nodeType == Node.ELEMENT_NODE) {
// Check for the lang HTML attribute
@@ -244,6 +302,14 @@ public class HTMLLanguageParser implemen
public void setConf(Configuration conf) {
this.conf = conf;
+ String[] policy = conf.getStrings("lang.extraction.policy");
+ for (int i = 0; i < policy.length; i++) {
+ if (policy[i].equals("detect")) {
+ detect = i;
+ } else if (policy[i].equals("identify")) {
+ identify = i;
+ }
+ }
}
public Configuration getConf() {
Modified: nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java?rev=1003608&r1=1003607&r2=1003608&view=diff
==============================================================================
--- nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java (original)
+++ nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java Fri Oct 1 18:29:42 2010
@@ -27,31 +27,24 @@ import org.apache.nutch.indexer.Indexing
import org.apache.nutch.indexer.IndexingFilter;
import org.apache.nutch.indexer.NutchDocument;
import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.net.protocols.Response;
import org.apache.nutch.storage.WebPage;
import org.apache.nutch.storage.WebPage.Field;
import org.apache.nutch.util.Bytes;
/**
- * An {@link org.apache.nutch.indexer.IndexingFilter} that add a
+ * An {@link org.apache.nutch.indexer.IndexingFilter} that adds a
* <code>lang</code> (language) field to the document.
- *
- * It tries to find the language of the document by:
- * <ul>
- * <li>First, checking if {@link HTMLLanguageParser} add some language
- * information</li>
- * <li>Then, checking if a <code>Content-Language</code> HTTP header can be
- * found</li>
- * <li>Finaly by analyzing the document content</li>
- * </ul>
- *
+ *
+ * It tries to find the language of the document by checking
+ * if {@link HTMLLanguageParser} has added some language
+ * information.
+ *
* @author Sami Siren
* @author Jerome Charron
*/
public class LanguageIndexingFilter implements IndexingFilter {
private Configuration conf;
- private LanguageIdentifier languageIdentifier;
private static final Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();
@@ -65,9 +58,7 @@ public class LanguageIndexingFilter impl
/**
* Constructs a new Language Indexing Filter.
*/
- public LanguageIndexingFilter() {
-
- }
+ public LanguageIndexingFilter() {}
public NutchDocument filter(NutchDocument doc, String url, WebPage page)
throws IndexingException {
@@ -79,21 +70,6 @@ public class LanguageIndexingFilter impl
lang = Bytes.toString(blang.array());
}
- // check if HTTP-header tells us the language
- if (lang == null) {
- Utf8 ulang = page.getFromHeaders(new Utf8(Response.CONTENT_LANGUAGE));
- if (ulang != null) {
- lang = ulang.toString();
- }
- }
-
- if (lang == null) {
- StringBuilder text = new StringBuilder();
- text.append(page.getTitle().toString()).append(" ").append(
- page.getText().toString());
- lang = this.languageIdentifier.identify(text);
- }
-
if (lang == null) {
lang = "unknown";
}
@@ -112,7 +88,6 @@ public class LanguageIndexingFilter impl
public void setConf(Configuration conf) {
this.conf = conf;
- this.languageIdentifier = new LanguageIdentifier(conf);
}
public Configuration getConf() {
Modified: nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java?rev=1003608&r1=1003607&r2=1003608&view=diff
==============================================================================
--- nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java (original)
+++ nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java Fri Oct 1 18:29:42 2010
@@ -17,6 +17,8 @@
package org.apache.nutch.analysis.lang;
// JUnit imports
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
import java.nio.ByteBuffer;
import junit.framework.TestCase;
@@ -28,6 +30,7 @@ import org.apache.nutch.storage.WebPage;
import org.apache.nutch.util.Bytes;
import org.apache.nutch.util.EncodingDetector;
import org.apache.nutch.util.NutchConfiguration;
+import org.apache.tika.language.LanguageIdentifier;
public class TestHTMLLanguageParser extends TestCase {
@@ -95,6 +98,50 @@ public class TestHTMLLanguageParser exte
}
}
+ public void testLanguageIndentifier() {
+ try {
+ long total = 0;
+ LanguageIdentifier identifier;
+ BufferedReader in = new BufferedReader(new InputStreamReader(this
+ .getClass().getResourceAsStream("test-referencial.txt")));
+ String line = null;
+ while ((line = in.readLine()) != null) {
+ String[] tokens = line.split(";");
+ if (!tokens[0].equals("")) {
+ StringBuilder content = new StringBuilder();
+ // Test each line of the file...
+ BufferedReader testFile = new BufferedReader(new InputStreamReader(
+ this.getClass().getResourceAsStream(tokens[0]), "UTF-8"));
+ String testLine = null, lang = null;
+ while ((testLine = testFile.readLine()) != null) {
+ content.append(testLine + "\n");
+ testLine = testLine.trim();
+ if (testLine.length() > 256) {
+ identifier = new LanguageIdentifier(testLine);
+ lang = identifier.getLanguage();
+ assertEquals(tokens[1], lang);
+ }
+ }
+ testFile.close();
+
+ // Test the whole file
+ long start = System.currentTimeMillis();
+ System.out.println(content.toString());
+ identifier = new LanguageIdentifier(content.toString());
+ lang = identifier.getLanguage();
+ System.out.println(lang);
+ total += System.currentTimeMillis() - start;
+ assertEquals(tokens[1], lang);
+ }
+ }
+ in.close();
+ System.out.println("Total Time=" + total);
+ } catch (Exception e) {
+ e.printStackTrace();
+ fail(e.toString());
+ }
+ }
+
private WebPage getPage(String text) {
WebPage page = new WebPage();
page.setBaseUrl(BASE);