You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by je...@apache.org on 2005/08/26 16:54:21 UTC
svn commit: r240254 - in
/lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang:
HTMLLanguageParser.java LanguageIdentifier.java LanguageIndexingFilter.java
LanguageQueryFilter.java NGramProfile.java
Author: jerome
Date: Fri Aug 26 07:54:16 2005
New Revision: 240254
URL: http://svn.apache.org/viewcvs?rev=240254&view=rev
Log:
Javadoc updates, corrections on input stream reading
Modified:
lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java
lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java
lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageQueryFilter.java
lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/NGramProfile.java
Modified: lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java?rev=240254&r1=240253&r2=240254&view=diff
==============================================================================
--- lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java (original)
+++ lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java Fri Aug 26 07:54:16 2005
@@ -23,20 +23,37 @@
import java.util.logging.Logger;
import org.apache.nutch.util.LogFormatter;
-/** Adds metadata identifying language of document if found
- * We could also run statistical analysis here but we'd miss all other formats
+/**
+ * An {@link org.apache.nutch.parse.HtmlParseFilter} that looks for possible
+ * indications of content language.
+ *
+ * If some indication is found, it is added in the {@link #META_LANG_NAME}
+ * attribute of the {@link org.apache.nutch.parse.ParseData} metadata.
+ *
+ * @author Sami Siren
+ * @author Jerome Charron
*/
public class HTMLLanguageParser implements HtmlParseFilter {
+
+ /** The language meta data attribute name */
public static final String META_LANG_NAME="X-meta-lang";
- public static final Logger LOG = LogFormatter
+
+ private static final Logger LOG = LogFormatter
.getLogger(HTMLLanguageParser.class.getName());
/**
- * Scan the HTML document looking at possible indications of content language<br>
- * <li>1. html lang attribute (http://www.w3.org/TR/REC-html40/struct/dirlang.html#h-8.1)
- * <li>2. meta dc.language (http://dublincore.org/documents/2000/07/16/usageguide/qualified-html.shtml#language)
- * <li>3. meta http-equiv (content-language) (http://www.w3.org/TR/REC-html40/struct/global.html#h-7.4.4.2)
- * <br>Only the first occurence of language is stored.
+ * Scan the HTML document looking at possible indications of content language.
+ * <ol>
+ * <li>html lang attribute
+ * (<a href="http://www.w3.org/TR/REC-html40/struct/dirlang.html#h-8.1">
+ * http://www.w3.org/TR/REC-html40/struct/dirlang.html#h-8.1</a>),</li>
+ * <li>meta dc.language (<a href="http://dublincore.org/documents/2000/07/16/usageguide/qualified-html.shtml#language">
+ * http://dublincore.org/documents/2000/07/16/usageguide/qualified-html.shtml#language</a>),</li>
+ * <li>meta http-equiv (content-language) (
+ * <a href="http://www.w3.org/TR/REC-html40/struct/global.html#h-7.4.4.2">
+ * http://www.w3.org/TR/REC-html40/struct/global.html#h-7.4.4.2</a>).</li>
+ * </ol>
+ * Only the first occurence of language is stored.
*/
public Parse filter(Content content, Parse parse, HTMLMetaTags metaTags, DocumentFragment doc) {
String lang = findLanguage(doc);
Modified: lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java?rev=240254&r1=240253&r2=240254&view=diff
==============================================================================
--- lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java (original)
+++ lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java Fri Aug 26 07:54:16 2005
@@ -20,6 +20,7 @@
import java.io.InputStream;
import java.io.IOException;
import java.io.BufferedReader;
+import java.io.ByteArrayOutputStream;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.util.List;
@@ -48,6 +49,10 @@
/**
+ * Identify the language of a content, based on statistical analysis.
+ *
+ * @see <a href="http://www.w3.org/WAI/ER/IG/ert/iso639.htm">ISO 639
+ * Language Codes</a>.
*
* @author Sami Siren
* @author Jerome Charron
@@ -59,8 +64,8 @@
private final static float SCORE_THRESOLD = 0.00F;
- public final static Logger LOG = LogFormatter.getLogger(LanguageIdentifier.class.getName());
-
+ private final static Logger LOG =
+ LogFormatter.getLogger(LanguageIdentifier.class.getName());
private ArrayList languages = new ArrayList();
@@ -168,7 +173,8 @@
}
/**
- * return handle to singleton instance
+ * Get a LanguageIdentifier instance.
+ * @return the LanguageIdentifier singleton instance.
*/
public static LanguageIdentifier getInstance() {
if (identifier == null) {
@@ -182,13 +188,25 @@
}
/**
- * main method used for testing
- *
- * @param args
+ * Main method used for command line process.
+ * <br/>Usage is:
+ * <pre>
+ * LanguageIdentifier [-identifyrows filename maxlines]
+ * [-identifyfile charset filename]
+ * [-identifyfileset charset files]
+ * [-identifytext text]
+ * [-identifyurl url]
+ * </pre>
+ * @param args arguments.
*/
public static void main(String args[]) {
- String usage = "Usage: LanguageIdentifier [-identifyrows filename maxlines] [-identifyfile filename] [-identifyfileset files] [-identifytext text] [-identifyurl url]";
+ String usage = "Usage: LanguageIdentifier " +
+ "[-identifyrows filename maxlines] " +
+ "[-identifyfile charset filename] " +
+ "[-identifyfileset charset files] " +
+ "[-identifytext text] " +
+ "[-identifyurl url]";
int command = 0;
final int IDFILE = 1;
@@ -199,6 +217,7 @@
Vector fileset = new Vector();
String filename = "";
+ String charset = "";
String url = "";
String text = "";
int max = 0;
@@ -211,6 +230,7 @@
for (int i = 0; i < args.length; i++) { // parse command line
if (args[i].equals("-identifyfile")) {
command = IDFILE;
+ charset = args[++i];
filename = args[++i];
}
@@ -233,6 +253,7 @@
if (args[i].equals("-identifyfileset")) {
command = IDFILESET;
+ charset = args[++i];
for (i++; i < args.length; i++) {
File[] files = null;
File f = new File(args[i]);
@@ -264,7 +285,7 @@
case IDFILE:
f = new File(filename);
fis = new FileInputStream(f);
- lang = idfr.identify(fis);
+ lang = idfr.identify(fis, charset);
fis.close();
break;
@@ -302,7 +323,7 @@
filename = (String) i.next();
f = new File(filename);
fis = new FileInputStream(f);
- lang = idfr.identify(fis);
+ lang = idfr.identify(fis, charset);
fis.close();
} catch (Exception e) {
System.out.println(e);
@@ -349,22 +370,26 @@
}
/**
- * Identify language based on submitted content
+ * Identify language of a content.
*
- * @param text to analyze
- * @return 2 letter ISO639 code of language (en, fi, sv...) , or null if
- * unknown
+ * @param content is the content to analyze.
+ * @return The 2 letter
+ * <a href="http://www.w3.org/WAI/ER/IG/ert/iso639.htm">ISO 639
+ * language code</a> (en, fi, sv, ...) of the language that best
+ * matches the specified content.
*/
- public String identify(String text) {
- return identify(new StringBuffer(text));
+ public String identify(String content) {
+ return identify(new StringBuffer(content));
}
/**
- * Identify language based on submitted content
+ * Identify language of a content.
*
- * @param text to analyze
- * @return 2 letter ISO639 code of language (en, fi, sv...) , or null if
- * unknown
+ * @param content is the content to analyze.
+ * @return The 2 letter
+ * <a href="http://www.w3.org/WAI/ER/IG/ert/iso639.htm">ISO 639
+ * language code</a> (en, fi, sv, ...) of the language that best
+ * matches the specified content.
*/
public String identify(StringBuffer content) {
@@ -405,26 +430,48 @@
}
/**
- * Identify language from inputstream
- *
- * @param is
- * @return language code
- * @throws IOException
+ * Identify language from input stream.
+ * This method uses the platform default encoding to read the input stream.
+ * For using a specific encoding, use the
+ * {@link #identify(InputStream, String)} method.
+ *
+ * @param is is the input stream to analyze.
+ * @return The 2 letter
+ * <a href="http://www.w3.org/WAI/ER/IG/ert/iso639.htm">ISO 639
+ * language code</a> (en, fi, sv, ...) of the language that best
+ * matches the content of the specified input stream.
+ * @throws IOException if something wrong occurs on the input stream.
*/
public String identify(InputStream is) throws IOException {
+ return identify(is, null);
+ }
+
+ /**
+ * Identify language from input stream.
+ *
+ * @param is is the input stream to analyze.
+ * @param charset is the charset to use to read the input stream.
+ * @return The 2 letter
+ * <a href="http://www.w3.org/WAI/ER/IG/ert/iso639.htm">ISO 639
+ * language code</a> (en, fi, sv, ...) of the language that best
+ * matches the content of the specified input stream.
+ * @throws IOException if something wrong occurs on the input stream.
+ */
+ public String identify(InputStream is, String charset) throws IOException {
- StringBuffer text = new StringBuffer();
+ ByteArrayOutputStream out = new ByteArrayOutputStream();
byte[] buffer = new byte[2048];
int len = 0;
while (((len = is.read(buffer)) != -1) &&
- ((analyzeLength == 0) || (text.length() < analyzeLength))) {
+ ((analyzeLength == 0) || (out.size() < analyzeLength))) {
if (analyzeLength != 0) {
- len = Math.min(len, analyzeLength - text.length());
+ len = Math.min(len, analyzeLength - out.size());
}
- text.append(new String(buffer, 0, len));
+ out.write(buffer, 0, len);
}
- return identify(text);
+ return identify((charset == null) ? out.toString()
+ : out.toString(charset));
}
}
Modified: lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java?rev=240254&r1=240253&r2=240254&view=diff
==============================================================================
--- lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java (original)
+++ lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java Fri Aug 26 07:54:16 2005
@@ -37,7 +37,7 @@
* information</li>
* <li>Then, checking if a <code>Content-Language</code> HTTP header can be
* found</li>
- * <li>Finaly by analyzing the document content</li>
+ * <li>Finaly by statisticaly analyzing the document content</li>
* </ul>
*
* @author Sami Siren
Modified: lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageQueryFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageQueryFilter.java?rev=240254&r1=240253&r2=240254&view=diff
==============================================================================
--- lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageQueryFilter.java (original)
+++ lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageQueryFilter.java Fri Aug 26 07:54:16 2005
@@ -18,10 +18,19 @@
import org.apache.nutch.searcher.RawFieldQueryFilter;
-/** Handles "lang:" query clauses, causing them to search the "lang" field
- * indexed by LanguageIdentifier. */
+/**
+ * A {@link org.apache.nutch.searcher.QueryFilter} that handles
+ * <code>"lang:"</code> query clauses.
+ * It search the <code>"lang"</code> field indexed by the
+ * LanguageIdentifier.
+ *
+ * @author Sami Siren
+ * @author Jerome Charron
+ */
public class LanguageQueryFilter extends RawFieldQueryFilter {
+
public LanguageQueryFilter() {
super("lang");
}
+
}
Modified: lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/NGramProfile.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/NGramProfile.java?rev=240254&r1=240253&r2=240254&view=diff
==============================================================================
--- lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/NGramProfile.java (original)
+++ lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/NGramProfile.java Fri Aug 26 07:54:16 2005
@@ -25,6 +25,8 @@
import java.io.FileOutputStream;
import java.io.InputStreamReader;
import java.io.BufferedInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.UnsupportedEncodingException;
import java.util.Date;
import java.util.List;
import java.util.Iterator;
@@ -42,19 +44,20 @@
/**
- * This class runs a ngram analysis over submitted text, results might be used
- * for automatic language identifiaction.
- *
- * The similarity calculation is at experimental level. You have been warned.
- *
- * Methods are provided to build new NGramProfiles profiles.
+ * This class represents a ngram profile.
+ * A ngram profile is a set of the most frequently used sequences of chars
+ * in a text or set of texts.
+ * This class can be used to runs a ngram analysis over submitted text and
+ * then to build new NGramProfiles.
+ * A profile can then be serialized into a textual file, or a profile can
+ * be initialized from a ngram profile file (ngp files).
*
* @author Sami Siren
- * @author Jerome Charron - http://frutch.free.fr/
+ * @author Jerome Charron
*/
public class NGramProfile {
- public static final Logger LOG = LogFormatter
+ static final Logger LOG = LogFormatter
.getLogger("org.apache.nutch.analysis.lang.NGramProfile");
/** The minimum length allowed for a ngram. */
@@ -119,7 +122,8 @@
}
/**
- * @return Returns the name.
+ * Returns the profile name.
+ * @return the profile name.
*/
public String getName() {
return name;
@@ -178,9 +182,9 @@
}
/**
- * Analyze a piece of text
+ * Analyze a piece of text.
*
- * @param text the text to be analyzed
+ * @param text is the text to be analyzed
*/
public void analyze(StringBuffer text) {
@@ -248,9 +252,11 @@
}
/**
- * Return a sorted list of ngrams (sort done by 1. frequency 2. sequence)
+ * Return a sorted list of ngrams.
+ * The list is sorted by:
+ * <ol><li>frequency</li><li>sequence</li></ol>
*
- * @return sorted vector of ngrams
+ * @return A sorted list of ngrams
*/
public List getSorted() {
// make sure sorting is done only once
@@ -285,10 +291,10 @@
/**
* Calculate a score how well NGramProfiles match each other
+ * The similarity calculation is at experimental level. You have been warned.
*
- * @param another
- * ngram profile to compare against
- * @return similarity 0=exact match
+ * @param another is the ngram profile to compare against
+ * @return a similarity indicator, where 0 stands for an exact match.
*/
public float getSimilarity(NGramProfile another) {
@@ -322,9 +328,10 @@
}
/**
- * Loads a ngram profile from an InputStream
- * (assumes UTF-8 encoded content)
- * @param is the InputStream to read
+ * Loads a ngram profile from an InputStream.
+ * <i>Please notice, that this method assumes that the stream is UTF-8
+ * encoded</i>.
+ * @param is is the InputStream to read
*/
public void load(InputStream is) throws IOException {
@@ -352,40 +359,43 @@
}
/**
- * Create a new Language profile from (preferably quite large) text file
+ * Create a new ngram profile from an input stream.
+ * <i>Please notice that the size of the submitted content must be quite
+ * large for a good result</i>.
*
- * @param name is thename of profile
- * @param is is the stream to read
- * @param encoding is the encoding of stream
- */
- public static NGramProfile create(String name, InputStream is, String encoding) {
+ * @param name is the name of the profile.
+ * @param is is the stream to read.
+ * @param encoding is the encoding of the stream.
+ */
+ public static NGramProfile create(String name,
+ InputStream is,
+ String encoding)
+ throws UnsupportedEncodingException {
NGramProfile newProfile = new NGramProfile(name, ABSOLUTE_MIN_NGRAM_LENGTH,
ABSOLUTE_MAX_NGRAM_LENGTH);
BufferedInputStream bis = new BufferedInputStream(is);
-
+ ByteArrayOutputStream bao = new ByteArrayOutputStream();
byte buffer[] = new byte[4096];
- StringBuffer text = new StringBuffer();
int len;
try {
while ((len = bis.read(buffer)) != -1) {
- text.append(new String(buffer, 0, len, encoding));
+ bao.write(buffer, 0, len);
}
} catch (IOException e) {
e.printStackTrace();
}
-
- newProfile.analyze(text);
+ newProfile.analyze(new StringBuffer(bao.toString(encoding)));
return newProfile;
}
/**
- * Writes NGramProfile content into OutputStream, content is outputted with
- * UTF-8 encoding
+ * Writes NGramProfile content into OutputStream.
+ * The content is outputted using UTF-8 encoding.
*
- * @param os the Stream to output to
- * @throws IOException
+ * @param os is the stream to output to.
+ * @throws IOException if something wrong occurs on the output stream.
*/
public void save(OutputStream os) throws IOException {
@@ -424,9 +434,14 @@
}
/**
- * main method used for testing only
- *
- * @param args
+ * Main method used for command line process.
+ * <br/>Usage is:
+ * <pre>
+ * NGramProfile [-create profilename filename encoding]
+ * [-similarity file1 file2]
+ * [-score profile-name filename encoding]
+ * </pre>
+ * @param args arguments.
*/
public static void main(String args[]) {
Re: svn commit: r240254 - in /lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang:
HTMLLanguageParser.java LanguageIdentifier.java LanguageIndexingFilter.java
LanguageQueryFilter.java NGramProfile.java
Posted by Dawid Weiss <da...@cs.put.poznan.pl>.
> It looks like you have commited your changes to tags directory. You
> should do it in branches. I think there is no way in SVN to force
> immutability of tags :(.
Just a thought -- there is at least one way to achieve this: set up a
permission system (either on top of Apache, or internal to Subversion)
and create two writer groups -- one for developers and a 'god mode'
group for those (or that person) who can create and alter the tags directory.
D.
Re: svn commit: r240254 - in /lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang: HTMLLanguageParser.java LanguageIdentifier.java LanguageIndexingFilter.java LanguageQueryFilter.java NGramProfile.java
Posted by Jérôme Charron <je...@gmail.com>.
> It looks like you have commited your changes to tags directory. You
> should do it in branches. I think there is no way in SVN to force
> immutability of tags :(.
Oops, sorry.
I am committing my changes to the branches directory right now.
Thanks, Piotr.
Regards
Jerome
--
http://motrech.free.fr/
http://www.frutch.org/
Re: svn commit: r240254 - in /lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang:
HTMLLanguageParser.java LanguageIdentifier.java LanguageIndexingFilter.java
LanguageQueryFilter.java NGramProfile.java
Posted by Piotr Kosiorowski <pk...@gmail.com>.
Hi Jerome,
It looks like you have committed your changes to the tags directory. You
should do it in the branches directory. I think there is no way in SVN to
force immutability of tags :(.
Regards
Piotr
jerome@apache.org wrote:
> Author: jerome
> Date: Fri Aug 26 07:54:16 2005
> New Revision: 240254
>
> URL: http://svn.apache.org/viewcvs?rev=240254&view=rev
> Log:
> Javadoc updates, corrections on input stream reading
>
> Modified:
> lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
> lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java
> lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java
> lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageQueryFilter.java
> lucene/nutch/tags/Release-0.7/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/NGramProfile.java
>