You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2009/07/31 10:14:29 UTC
svn commit: r799541 - in /lucene/tika/trunk/tika-core/src:
main/java/org/apache/tika/language/ test/java/org/apache/tika/language/
Author: jukka
Date: Fri Jul 31 08:14:29 2009
New Revision: 799541
URL: http://svn.apache.org/viewvc?rev=799541&view=rev
Log:
TIKA-209: Language detection is weak.
Move NGramEntry into a top level class.
Added:
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/NGramEntry.java
Modified:
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/NGramProfile.java
lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/TestLanguageIdentifier.java
lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/TestNGramProfile.java
Modified: lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java?rev=799541&r1=799540&r2=799541&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java (original)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java Fri Jul 31 08:14:29 2009
@@ -31,8 +31,6 @@
import java.util.Properties;
import java.util.Vector;
-import org.apache.tika.language.NGramProfile.NGramEntry;
-
/**
* Identify the language of a content, based on statistical analysis.
*
@@ -112,7 +110,6 @@
tmpIdx.put(entry, registered);
}
registered.add(entry);
- entry.setProfile(profile);
}
list.append(" " + lang + "(" + ngrams.size() + ")");
is.close();
Added: lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/NGramEntry.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/NGramEntry.java?rev=799541&view=auto
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/NGramEntry.java (added)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/NGramEntry.java Fri Jul 31 08:14:29 2009
@@ -0,0 +1,148 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.language;
+
+/**
+ * A single ngram of an {@link NGramProfile}: a character sequence with its occurrence count and frequency.
+ */
+class NGramEntry implements Comparable<NGramEntry> {
+
+ /**
+ * The ngram profile this entry is related to; assigned once in the constructor
+ */
+ private final NGramProfile profile;
+
+ /**
+ * The sequence of characters of the ngram
+ */
+ private CharSequence seq = null;
+
+ /**
+ * The number of occurrences of this ngram in its profile
+ */
+ private int count = 0;
+
+ /**
+ * The frequency of this ngram in its profile. Stays at zero until the
+ * {@link #calculateFrequency(int)} method has been called.
+ */
+ private float frequency = 0.0F;
+
+ /**
+ * Constructs a new NGramEntry whose occurrence count starts at zero
+ * @param nGramProfile is the profile this entry is related to
+ * @param seq is the sequence of characters of the ngram
+ */
+ public NGramEntry(NGramProfile nGramProfile, CharSequence seq) {
+ this.profile = nGramProfile;
+ this.seq = seq;
+ }
+
+ /**
+ * Constructs a new NGramEntry with an already known occurrence count
+ * @param nGramProfile is the profile this entry is related to
+ * @param seq is the sequence of characters of the ngram
+ * @param count is the number of occurrences of this ngram
+ */
+ public NGramEntry(NGramProfile nGramProfile, String seq, int count) {
+ this.profile = nGramProfile;
+ this.seq = new StringBuffer(seq).subSequence(0, seq.length());
+ this.count = count;
+ }
+
+ /**
+ * Returns the number of occurrences of this ngram in its profile
+ * @return the number of occurrences of this ngram in its profile
+ */
+ public int getCount() {
+ return count;
+ }
+
+ /**
+ * Returns the frequency of this ngram in its profile
+ * @return the value last set by {@link #calculateFrequency(int)}, or zero if it was never called
+ */
+ public float getFrequency() {
+ return frequency;
+ }
+
+ public void calculateFrequency(int totalCount) { // stores count / totalCount as the frequency
+ frequency = (float) count / (float) totalCount; // float division; totalCount is not checked for zero
+ }
+
+ /**
+ * Returns the sequence of characters of this ngram
+ * @return the sequence of characters of this ngram
+ */
+ public CharSequence getSeq() {
+ return seq;
+ }
+
+ /**
+ * Returns the size of this ngram
+ * @return the length of the character sequence of this ngram
+ */
+ public int size() {
+ return seq.length();
+ }
+
+ // Orders entries by descending frequency; equal frequencies are ordered by their toString() text
+ public int compareTo(NGramEntry ngram) {
+ int diff = Float.compare(ngram.getFrequency(), frequency);
+ if (diff != 0) {
+ return diff;
+ } else {
+ return (toString().compareTo(ngram.toString()));
+ }
+ }
+
+ /**
+ * Increments the number of occurrences of this ngram by one.
+ */
+ public void inc() {
+ count++;
+ }
+
+ /**
+ * Returns the profile associated to this ngram
+ * @return the profile given to the constructor
+ */
+ public NGramProfile getProfile() {
+ return profile;
+ }
+
+ public String toString() { // this text is also the tie-breaker used by compareTo
+ return "ngram(" + seq + "," + count + "," + frequency + ")";
+ }
+
+ // The hash code is taken from the character sequence only, like the comparison in equals()
+ public int hashCode() {
+ return seq.hashCode();
+ }
+
+ // Entries are equal when their character sequences are equal; profile, count and frequency are ignored
+ public boolean equals(Object obj) {
+ NGramEntry ngram = null;
+ try {
+ ngram = (NGramEntry) obj;
+ return ngram.seq.equals(seq);
+ } catch (Exception e) { // a null or non-NGramEntry argument lands here (NullPointerException / ClassCastException)
+ return false;
+ }
+ }
+
+}
\ No newline at end of file
Modified: lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/NGramProfile.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/NGramProfile.java?rev=799541&r1=799540&r2=799541&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/NGramProfile.java (original)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/NGramProfile.java Fri Jul 31 08:14:29 2009
@@ -148,7 +148,7 @@
if (cs.equals(SEP_CHARSEQ)) { return; }
NGramEntry nge = ngrams.get(cs);
if (nge == null) {
- nge = new NGramEntry(cs);
+ nge = new NGramEntry(this, cs);
ngrams.put(cs, nge);
}
nge.inc();
@@ -200,26 +200,19 @@
}
/**
- * Normalize the profile (calculates the ngrams frequencies)
+ * Normalize the profile (calculates the ngram frequencies)
*/
protected void normalize() {
- NGramEntry e = null;
- //List sorted = getSorted();
- Iterator<NGramEntry> i = ngrams.values().iterator();
-
- // Calculate ngramcount if not already done
+ // Calculate ngram count if not already done
if (ngramcounts == null) {
ngramcounts = new int[maxLength+1];
- while (i.hasNext()) {
- e = i.next();
- ngramcounts[e.size()] += e.count;
+ for (NGramEntry entry : ngrams.values()) {
+ ngramcounts[entry.size()] += entry.getCount();
}
}
- i = ngrams.values().iterator();
- while (i.hasNext()) {
- e = i.next();
- e.frequency = (float) e.count / (float) ngramcounts[e.size()];
+ for (NGramEntry entry : ngrams.values()) {
+ entry.calculateFrequency(ngramcounts[entry.size()]);
}
}
@@ -247,13 +240,10 @@
StringBuffer s = new StringBuffer().append("NGramProfile: ")
.append(name).append("\n");
- Iterator<NGramEntry> i = getSorted().iterator();
-
- while (i.hasNext()) {
- NGramEntry entry = i.next();
- s.append("[").append(entry.seq)
- .append("/").append(entry.count)
- .append("/").append(entry.frequency).append("]\n");
+ for (NGramEntry entry : getSorted()) {
+ s.append(" ");
+ s.append(entry);
+ s.append("\n");
}
return s.toString();
}
@@ -272,21 +262,21 @@
Iterator<NGramEntry> i = another.getSorted().iterator();
while (i.hasNext()) {
NGramEntry other = i.next();
- if (ngrams.containsKey(other.seq)) {
- sum += Math.abs((other.frequency -
- ngrams.get(other.seq).frequency)) / 2;
+ if (ngrams.containsKey(other.getSeq())) {
+ sum += Math.abs((other.getFrequency() -
+ ngrams.get(other.getSeq()).getFrequency())) / 2;
} else {
- sum += other.frequency;
+ sum += other.getFrequency();
}
}
i = getSorted().iterator();
while (i.hasNext()) {
NGramEntry other = i.next();
- if (another.ngrams.containsKey(other.seq)) {
- sum += Math.abs((other.frequency -
- another.ngrams.get(other.seq).frequency)) / 2;
+ if (another.ngrams.containsKey(other.getSeq())) {
+ sum += Math.abs((other.getFrequency() -
+ another.ngrams.get(other.getSeq()).getFrequency())) / 2;
} else {
- sum += other.frequency;
+ sum += other.getFrequency();
}
}
} catch (Exception e) {
@@ -315,7 +305,7 @@
int len = ngramsequence.length();
if ((len >= minLength) && (len <= maxLength)) {
int ngramcount = Integer.parseInt(line.substring(spacepos + 1));
- NGramEntry en = new NGramEntry(ngramsequence, ngramcount);
+ NGramEntry en = new NGramEntry(this, ngramsequence, ngramcount);
ngrams.put(en.getSeq(), en);
ngramcounts[len] += ngramcount;
}
@@ -486,130 +476,6 @@
}
}
- /**
- * Inner class that describes a NGram
- */
- class NGramEntry implements Comparable<NGramEntry> {
-
- /** The NGRamProfile this NGram is related to */
- private NGramProfile profile = null;
-
- /** The sequence of characters of the ngram */
- CharSequence seq = null;
-
- /** The number of occurences of this ngram in its profile */
- private int count = 0;
-
- /** The frequency of this ngram in its profile */
- private float frequency = 0.0F;
-
- /**
- * Constructs a new NGramEntry
- * @param seq is the sequence of characters of the ngram
- */
- public NGramEntry(CharSequence seq) {
- this.seq = seq;
- }
-
- /**
- * Constructs a new NGramEntry
- * @param seq is the sequence of characters of the ngram
- * @param count is the number of occurences of this ngram
- */
- public NGramEntry(String seq, int count) {
- this.seq = new StringBuffer(seq).subSequence(0, seq.length());
- this.count = count;
- }
-
- /**
- * Returns the number of occurences of this ngram in its profile
- * @return the number of occurences of this ngram in its profile
- */
- public int getCount() {
- return count;
- }
-
- /**
- * Returns the frequency of this ngram in its profile
- * @return the frequency of this ngram in its profile
- */
- public float getFrequency() {
- return frequency;
- }
-
- /**
- * Returns the sequence of characters of this ngram
- * @return the sequence of characters of this ngram
- */
- public CharSequence getSeq() {
- return seq;
- }
-
- /**
- * Returns the size of this ngram
- * @return the size of this ngram
- */
- public int size() {
- return seq.length();
- }
-
- // Inherited JavaDoc
- public int compareTo(NGramEntry ngram) {
- int diff = Float.compare(ngram.getFrequency(), frequency);
- if (diff != 0) {
- return diff;
- } else {
- return (toString().compareTo(ngram.toString()));
- }
- }
-
- /**
- * Increments the number of occurences of this ngram.
- */
- public void inc() {
- count++;
- }
-
- /**
- * Associated a profile to this ngram
- * @param profile is the profile associated to this ngram
- */
- public void setProfile(NGramProfile profile) {
- this.profile = profile;
- }
-
- /**
- * Returns the profile associated to this ngram
- * @return the profile associated to this ngram
- */
- public NGramProfile getProfile() {
- return profile;
- }
-
- // Inherited JavaDoc
- public String toString() {
- return seq.toString();
- }
-
- // Inherited JavaDoc
- public int hashCode() {
- return seq.hashCode();
- }
-
- // Inherited JavaDoc
- public boolean equals(Object obj) {
-
- NGramEntry ngram = null;
- try {
- ngram = (NGramEntry) obj;
- return ngram.seq.equals(seq);
- } catch (Exception e) {
- return false;
- }
- }
-
- }
-
private class QuickStringBuffer implements CharSequence {
private char value[];
Modified: lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/TestLanguageIdentifier.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/TestLanguageIdentifier.java?rev=799541&r1=799540&r2=799541&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/TestLanguageIdentifier.java (original)
+++ lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/TestLanguageIdentifier.java Fri Jul 31 08:14:29 2009
@@ -28,8 +28,6 @@
import junit.framework.TestSuite;
import junit.textui.TestRunner;
-import org.apache.tika.language.NGramProfile.NGramEntry;
-
/**
* JUnit based test of class {@link LanguageIdentifier}.
*
Modified: lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/TestNGramProfile.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/TestNGramProfile.java?rev=799541&r1=799540&r2=799541&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/TestNGramProfile.java (original)
+++ lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/TestNGramProfile.java Fri Jul 31 08:14:29 2009
@@ -23,8 +23,6 @@
import junit.framework.TestCase;
-import org.apache.tika.language.NGramProfile.NGramEntry;
-
public class TestNGramProfile extends TestCase {
String tokencontent1 = "testaddtoken";