You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2009/07/31 10:14:29 UTC

svn commit: r799541 - in /lucene/tika/trunk/tika-core/src: main/java/org/apache/tika/language/ test/java/org/apache/tika/language/

Author: jukka
Date: Fri Jul 31 08:14:29 2009
New Revision: 799541

URL: http://svn.apache.org/viewvc?rev=799541&view=rev
Log:
TIKA-209: Language detection is weak.

Move NGramEntry into a top level class.

Added:
    lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/NGramEntry.java
Modified:
    lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java
    lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/NGramProfile.java
    lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/TestLanguageIdentifier.java
    lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/TestNGramProfile.java

Modified: lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java?rev=799541&r1=799540&r2=799541&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java (original)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java Fri Jul 31 08:14:29 2009
@@ -31,8 +31,6 @@
 import java.util.Properties;
 import java.util.Vector;
 
-import org.apache.tika.language.NGramProfile.NGramEntry;
-
 /**
  * Identify the language of a content, based on statistical analysis.
  *
@@ -112,7 +110,6 @@
                                 tmpIdx.put(entry, registered);
                             }
                             registered.add(entry);
-                            entry.setProfile(profile);
                         }
                         list.append(" " + lang + "(" + ngrams.size() + ")");
                         is.close();

Added: lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/NGramEntry.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/NGramEntry.java?rev=799541&view=auto
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/NGramEntry.java (added)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/NGramEntry.java Fri Jul 31 08:14:29 2009
@@ -0,0 +1,148 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.language;
+
+/**
+ * A single ngram: its character sequence, occurrence count and frequency within one {@link NGramProfile}.
+ */
+class NGramEntry implements Comparable<NGramEntry> {
+
+    /**
+     * The ngram profile this entry is related to
+     */
+    private final NGramProfile profile;
+
+    /**
+     * The sequence of characters of the ngram
+     */
+    private CharSequence seq = null;
+
+    /**
+     * The number of occurrences of this ngram in its profile
+     */
+    private int count = 0;
+
+    /**
+     * The frequency of this ngram in its profile. Calculated by the
+     * {@link #calculateFrequency(int)} method.
+     */
+    private float frequency = 0.0F;
+
+    /**
+     * Constructs a new NGramEntry with an occurrence count of zero
+     * @param nGramProfile is the profile this ngram is related to
+     * @param seq is the sequence of characters of the ngram
+     */
+    public NGramEntry(NGramProfile nGramProfile, CharSequence seq) {
+        this.profile = nGramProfile;
+        this.seq = seq;
+    }
+
+    /**
+     * Constructs a new NGramEntry with a known occurrence count
+     * @param nGramProfile is the profile this ngram is related to
+     * @param seq is the sequence of characters of the ngram
+     * @param count is the number of occurrences of this ngram
+     */
+    public NGramEntry(NGramProfile nGramProfile, String seq, int count) {
+        this.profile = nGramProfile;
+        this.seq = new StringBuffer(seq).subSequence(0, seq.length()); // keeps the full string as a CharSequence
+        this.count = count;
+    }
+
+    /**
+     * Returns the number of occurrences of this ngram in its profile
+     * @return the number of occurrences of this ngram in its profile
+     */
+    public int getCount() {
+        return count;
+    }
+
+    /**
+     * Returns the frequency of this ngram in its profile
+     * @return the frequency of this ngram in its profile
+     */
+    public float getFrequency() {
+        return frequency;
+    }
+
+    public void calculateFrequency(int totalCount) {
+        frequency = (float) count / (float) totalCount; // float division: this ngram's share of totalCount
+    }
+
+    /**
+     * Returns the sequence of characters of this ngram
+     * @return the sequence of characters of this ngram
+     */
+    public CharSequence getSeq() {
+        return seq;
+    }
+
+    /**
+     * Returns the size of this ngram
+     * @return the size of this ngram
+     */
+    public int size() {
+        return seq.length();
+    }
+
+    // Orders by descending frequency; equal frequencies are ordered by comparing toString() values
+    public int compareTo(NGramEntry ngram) {
+        int diff = Float.compare(ngram.getFrequency(), frequency);
+        if (diff != 0) {
+            return diff;
+        } else {
+            return (toString().compareTo(ngram.toString()));
+        }
+    }
+
+    /**
+     * Increments the number of occurrences of this ngram.
+     */
+    public void inc() {
+        count++;
+    }
+
+    /**
+     * Returns the profile associated to this ngram
+     * @return the profile associated to this ngram
+     */
+    public NGramProfile getProfile() {
+        return profile;
+    }
+
+    public String toString() {
+        return "ngram(" + seq + "," + count + "," + frequency + ")";
+    }
+
+    // The hash code is that of the character sequence alone (count, frequency and profile are ignored)
+    public int hashCode() {
+        return seq.hashCode();
+    }
+
+    // Equal when obj is an NGramEntry whose seq equals this seq; null or other types end up in the catch
+    public boolean equals(Object obj) {
+        NGramEntry ngram = null;
+        try {
+            ngram = (NGramEntry) obj;
+            return ngram.seq.equals(seq);
+        } catch (Exception e) {
+            return false;
+        }
+    }
+
+}
\ No newline at end of file

Modified: lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/NGramProfile.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/NGramProfile.java?rev=799541&r1=799540&r2=799541&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/NGramProfile.java (original)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/NGramProfile.java Fri Jul 31 08:14:29 2009
@@ -148,7 +148,7 @@
         if (cs.equals(SEP_CHARSEQ)) { return; }
         NGramEntry nge = ngrams.get(cs);
         if (nge == null) {
-            nge = new NGramEntry(cs);
+            nge = new NGramEntry(this, cs);
             ngrams.put(cs, nge);
         }
         nge.inc();
@@ -200,26 +200,19 @@
     }
 
     /**
-     * Normalize the profile (calculates the ngrams frequencies)
+     * Normalize the profile (calculates the ngram frequencies)
      */
     protected void normalize() {
-        NGramEntry e = null;
-        //List sorted = getSorted();
-        Iterator<NGramEntry> i = ngrams.values().iterator();
-
-        // Calculate ngramcount if not already done
+        // Calculate ngram count if not already done
         if (ngramcounts == null) {
             ngramcounts = new int[maxLength+1];
-            while (i.hasNext()) {
-                e = i.next();
-                ngramcounts[e.size()] += e.count;
+            for (NGramEntry entry : ngrams.values()) {
+                ngramcounts[entry.size()] += entry.getCount();
             }
         }
 
-        i = ngrams.values().iterator();
-        while (i.hasNext()) {
-            e = i.next();
-            e.frequency = (float) e.count / (float) ngramcounts[e.size()];
+        for (NGramEntry entry : ngrams.values()) {
+            entry.calculateFrequency(ngramcounts[entry.size()]);
         }
     }
 
@@ -247,13 +240,10 @@
         StringBuffer s = new StringBuffer().append("NGramProfile: ")
         .append(name).append("\n");
 
-        Iterator<NGramEntry> i = getSorted().iterator();
-
-        while (i.hasNext()) {
-            NGramEntry entry = i.next();
-            s.append("[").append(entry.seq)
-            .append("/").append(entry.count)
-            .append("/").append(entry.frequency).append("]\n");
+        for (NGramEntry entry : getSorted()) {
+            s.append("    ");
+            s.append(entry);
+            s.append("\n");
         }
         return s.toString();
     }
@@ -272,21 +262,21 @@
             Iterator<NGramEntry> i = another.getSorted().iterator();
             while (i.hasNext()) {
                 NGramEntry other = i.next();
-                if (ngrams.containsKey(other.seq)) {
-                    sum += Math.abs((other.frequency -
-                            ngrams.get(other.seq).frequency)) / 2;
+                if (ngrams.containsKey(other.getSeq())) {
+                    sum += Math.abs((other.getFrequency() -
+                            ngrams.get(other.getSeq()).getFrequency())) / 2;
                 } else {
-                    sum += other.frequency;
+                    sum += other.getFrequency();
                 }
             }
             i = getSorted().iterator();
             while (i.hasNext()) {
                 NGramEntry other = i.next();
-                if (another.ngrams.containsKey(other.seq)) {
-                    sum += Math.abs((other.frequency -
-                            another.ngrams.get(other.seq).frequency)) / 2;
+                if (another.ngrams.containsKey(other.getSeq())) {
+                    sum += Math.abs((other.getFrequency() -
+                            another.ngrams.get(other.getSeq()).getFrequency())) / 2;
                 } else {
-                    sum += other.frequency;
+                    sum += other.getFrequency();
                 }
             }
         } catch (Exception e) {
@@ -315,7 +305,7 @@
                 int len = ngramsequence.length();
                 if ((len >= minLength) && (len <= maxLength)) {
                     int ngramcount = Integer.parseInt(line.substring(spacepos + 1));
-                    NGramEntry en = new NGramEntry(ngramsequence, ngramcount);
+                    NGramEntry en = new NGramEntry(this, ngramsequence, ngramcount);
                     ngrams.put(en.getSeq(), en);
                     ngramcounts[len] += ngramcount;
                 }
@@ -486,130 +476,6 @@
         }
     }
 
-    /**
-     * Inner class that describes a NGram
-     */
-    class NGramEntry implements Comparable<NGramEntry> {
-
-        /** The NGRamProfile this NGram is related to */
-        private NGramProfile profile = null;
-
-        /** The sequence of characters of the ngram */
-        CharSequence seq = null;
-
-        /** The number of occurences of this ngram in its profile */
-        private int count = 0;
-
-        /** The frequency of this ngram in its profile */
-        private float frequency = 0.0F;
-
-        /** 
-         * Constructs a new NGramEntry
-         * @param seq is the sequence of characters of the ngram
-         */
-        public NGramEntry(CharSequence seq) {
-            this.seq = seq;
-        }
-
-        /** 
-         * Constructs a new NGramEntry
-         * @param seq is the sequence of characters of the ngram
-         * @param count is the number of occurences of this ngram
-         */
-        public NGramEntry(String seq, int count) {
-            this.seq = new StringBuffer(seq).subSequence(0, seq.length());
-            this.count = count;
-        }
-
-        /**
-         * Returns the number of occurences of this ngram in its profile
-         * @return the number of occurences of this ngram in its profile
-         */
-        public int getCount() {
-            return count;
-        }
-
-        /**
-         * Returns the frequency of this ngram in its profile
-         * @return the frequency of this ngram in its profile
-         */
-        public float getFrequency() {
-            return frequency;
-        }
-
-        /**
-         * Returns the sequence of characters of this ngram
-         * @return the sequence of characters of this ngram
-         */
-        public CharSequence getSeq() {
-            return seq;
-        }
-
-        /**
-         * Returns the size of this ngram
-         * @return the size of this ngram
-         */
-        public int size() {
-            return seq.length();
-        }
-
-        // Inherited JavaDoc
-        public int compareTo(NGramEntry ngram) {
-            int diff = Float.compare(ngram.getFrequency(), frequency);
-            if (diff != 0) {
-                return diff;
-            } else {
-                return (toString().compareTo(ngram.toString()));
-            }
-        }
-
-        /**
-         * Increments the number of occurences of this ngram.
-         */
-        public void inc() {
-            count++;
-        }
-
-        /**
-         * Associated a profile to this ngram
-         * @param profile is the profile associated to this ngram
-         */
-        public void setProfile(NGramProfile profile) {
-            this.profile = profile;
-        }
-
-        /**
-         * Returns the profile associated to this ngram
-         * @return the profile associated to this ngram
-         */
-        public NGramProfile getProfile() {
-            return profile;
-        }
-
-        // Inherited JavaDoc
-        public String toString() {
-            return seq.toString();
-        }
-
-        // Inherited JavaDoc
-        public int hashCode() {
-            return seq.hashCode();
-        }
-
-        // Inherited JavaDoc
-        public boolean equals(Object obj) {
-
-            NGramEntry ngram = null;
-            try {
-                ngram = (NGramEntry) obj;
-                return ngram.seq.equals(seq);
-            } catch (Exception e) {
-                return false;
-            }
-        }
-
-    }
-
     private class QuickStringBuffer implements CharSequence {
 
         private char value[];

Modified: lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/TestLanguageIdentifier.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/TestLanguageIdentifier.java?rev=799541&r1=799540&r2=799541&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/TestLanguageIdentifier.java (original)
+++ lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/TestLanguageIdentifier.java Fri Jul 31 08:14:29 2009
@@ -28,8 +28,6 @@
 import junit.framework.TestSuite;
 import junit.textui.TestRunner;
 
-import org.apache.tika.language.NGramProfile.NGramEntry;
-
 /**
  * JUnit based test of class {@link LanguageIdentifier}.
  *

Modified: lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/TestNGramProfile.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/TestNGramProfile.java?rev=799541&r1=799540&r2=799541&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/TestNGramProfile.java (original)
+++ lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/TestNGramProfile.java Fri Jul 31 08:14:29 2009
@@ -23,8 +23,6 @@
 
 import junit.framework.TestCase;
 
-import org.apache.tika.language.NGramProfile.NGramEntry;
-
 public class TestNGramProfile extends TestCase {
 
     String tokencontent1 = "testaddtoken";