You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2009/07/31 09:33:20 UTC

svn commit: r799531 - in /lucene/tika/trunk/tika-core/src: main/java/org/apache/tika/language/ main/resources/org/apache/tika/language/ test/java/org/apache/tika/language/ test/resources/org/apache/tika/language/

Author: jukka
Date: Fri Jul 31 07:33:19 2009
New Revision: 799531

URL: http://svn.apache.org/viewvc?rev=799531&view=rev
Log:
TIKA-209: Language detection is weak.

Move the n-gram profiles, language mappings and test data from src/*/java to src/*/resources.

Added:
    lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/language/
    lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/language/da.ngp
      - copied unchanged from r799462, lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/da.ngp
    lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/language/de.ngp
      - copied unchanged from r799462, lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/de.ngp
    lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/language/ee.ngp
      - copied unchanged from r799462, lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/ee.ngp
    lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/language/el.ngp
      - copied unchanged from r799462, lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/el.ngp
    lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/language/en.ngp
      - copied unchanged from r799462, lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/en.ngp
    lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/language/es.ngp
      - copied unchanged from r799462, lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/es.ngp
    lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/language/fi.ngp
      - copied unchanged from r799462, lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/fi.ngp
    lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/language/fr.ngp
      - copied unchanged from r799462, lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/fr.ngp
    lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/language/hu.ngp
      - copied unchanged from r799462, lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/hu.ngp
    lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/language/is.ngp
      - copied unchanged from r799462, lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/is.ngp
    lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/language/it.ngp
      - copied unchanged from r799462, lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/it.ngp
    lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/language/langmappings.properties
      - copied unchanged from r799462, lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/langmappings.properties
    lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/language/nl.ngp
      - copied unchanged from r799462, lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/nl.ngp
    lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/language/no.ngp
      - copied unchanged from r799462, lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/no.ngp
    lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/language/pl.ngp
      - copied unchanged from r799462, lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/pl.ngp
    lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/language/pt.ngp
      - copied unchanged from r799462, lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/pt.ngp
    lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/language/ru.ngp
      - copied unchanged from r799462, lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/ru.ngp
    lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/language/sv.ngp
      - copied unchanged from r799462, lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/sv.ngp
    lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/language/th.ngp
      - copied unchanged from r799462, lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/th.ngp
    lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/language/
    lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/language/da.test
      - copied unchanged from r799462, lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/da.test
    lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/language/de.test
      - copied unchanged from r799462, lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/de.test
    lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/language/el.test
      - copied unchanged from r799462, lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/el.test
    lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/language/en.test
      - copied unchanged from r799462, lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/en.test
    lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/language/es.test
      - copied unchanged from r799462, lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/es.test
    lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/language/fi.test
      - copied unchanged from r799462, lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/fi.test
    lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/language/fr.test
      - copied unchanged from r799462, lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/fr.test
    lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/language/it.test
      - copied unchanged from r799462, lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/it.test
    lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/language/nl.test
      - copied unchanged from r799462, lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/nl.test
    lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/language/pt.test
      - copied unchanged from r799462, lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/pt.test
    lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/language/sv.test
      - copied unchanged from r799462, lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/sv.test
    lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/language/test-referencial.txt
      - copied unchanged from r799462, lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/test-referencial.txt
Removed:
    lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/da.ngp
    lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/de.ngp
    lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/ee.ngp
    lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/el.ngp
    lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/en.ngp
    lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/es.ngp
    lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/fi.ngp
    lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/fr.ngp
    lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/hu.ngp
    lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/is.ngp
    lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/it.ngp
    lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/langmappings.properties
    lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/nl.ngp
    lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/no.ngp
    lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/pl.ngp
    lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/pt.ngp
    lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/ru.ngp
    lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/sv.ngp
    lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/th.ngp
    lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/da.test
    lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/de.test
    lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/el.test
    lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/en.test
    lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/es.test
    lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/fi.test
    lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/fr.test
    lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/it.test
    lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/nl.test
    lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/pt.test
    lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/sv.test
    lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/test-referencial.txt
Modified:
    lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java
    lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/TestLanguageIdentifier.java

Modified: lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java?rev=799531&r1=799530&r2=799531&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java (original)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java Fri Jul 31 07:33:19 2009
@@ -98,7 +98,7 @@
         String lang = (String) (alllanguages.nextElement());
 
         InputStream is = this.getClass().getClassLoader().getResourceAsStream(
-                "org/apache/nutch/analysis/lang/" + lang + "." + NGramProfile.FILE_EXTENSION);
+                "org/apache/tika/language/" + lang + "." + NGramProfile.FILE_EXTENSION);
 
         if (is != null) {
           NGramProfile profile = new NGramProfile(lang, minLength, maxLength);
@@ -137,6 +137,7 @@
       // Create the suspect profile
       suspect = new NGramProfile("suspect", minLength, maxLength);
     } catch (Exception e) {
+        e.printStackTrace();
       // if (LOG.isFatalEnabled()) { LOG.fatal(e.toString()); }
     }
   }

Modified: lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/TestLanguageIdentifier.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/TestLanguageIdentifier.java?rev=799531&r1=799530&r2=799531&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/TestLanguageIdentifier.java (original)
+++ lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/TestLanguageIdentifier.java Fri Jul 31 07:33:19 2009
@@ -180,8 +180,7 @@
     }
   }
 
-    // Disable until the resource files are properly located
-    public void disabledTtestIdentify() {
+    public void testIdentify() {
         try {
             long total = 0;
             LanguageIdentifier idfr = new LanguageIdentifier();