You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2009/07/31 09:33:20 UTC
svn commit: r799531 - in /lucene/tika/trunk/tika-core/src:
main/java/org/apache/tika/language/ main/resources/org/apache/tika/language/
test/java/org/apache/tika/language/ test/resources/org/apache/tika/language/
Author: jukka
Date: Fri Jul 31 07:33:19 2009
New Revision: 799531
URL: http://svn.apache.org/viewvc?rev=799531&view=rev
Log:
TIKA-209: Language detection is weak.
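Move the ngram resources from src/*/java to src/*/resources, point LanguageIdentifier at the org/apache/tika/language/ resource path, and re-enable testIdentify.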
Added:
lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/language/
lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/language/da.ngp
- copied unchanged from r799462, lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/da.ngp
lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/language/de.ngp
- copied unchanged from r799462, lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/de.ngp
lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/language/ee.ngp
- copied unchanged from r799462, lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/ee.ngp
lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/language/el.ngp
- copied unchanged from r799462, lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/el.ngp
lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/language/en.ngp
- copied unchanged from r799462, lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/en.ngp
lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/language/es.ngp
- copied unchanged from r799462, lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/es.ngp
lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/language/fi.ngp
- copied unchanged from r799462, lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/fi.ngp
lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/language/fr.ngp
- copied unchanged from r799462, lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/fr.ngp
lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/language/hu.ngp
- copied unchanged from r799462, lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/hu.ngp
lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/language/is.ngp
- copied unchanged from r799462, lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/is.ngp
lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/language/it.ngp
- copied unchanged from r799462, lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/it.ngp
lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/language/langmappings.properties
- copied unchanged from r799462, lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/langmappings.properties
lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/language/nl.ngp
- copied unchanged from r799462, lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/nl.ngp
lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/language/no.ngp
- copied unchanged from r799462, lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/no.ngp
lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/language/pl.ngp
- copied unchanged from r799462, lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/pl.ngp
lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/language/pt.ngp
- copied unchanged from r799462, lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/pt.ngp
lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/language/ru.ngp
- copied unchanged from r799462, lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/ru.ngp
lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/language/sv.ngp
- copied unchanged from r799462, lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/sv.ngp
lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/language/th.ngp
- copied unchanged from r799462, lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/th.ngp
lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/language/
lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/language/da.test
- copied unchanged from r799462, lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/da.test
lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/language/de.test
- copied unchanged from r799462, lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/de.test
lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/language/el.test
- copied unchanged from r799462, lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/el.test
lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/language/en.test
- copied unchanged from r799462, lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/en.test
lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/language/es.test
- copied unchanged from r799462, lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/es.test
lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/language/fi.test
- copied unchanged from r799462, lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/fi.test
lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/language/fr.test
- copied unchanged from r799462, lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/fr.test
lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/language/it.test
- copied unchanged from r799462, lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/it.test
lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/language/nl.test
- copied unchanged from r799462, lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/nl.test
lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/language/pt.test
- copied unchanged from r799462, lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/pt.test
lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/language/sv.test
- copied unchanged from r799462, lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/sv.test
lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/language/test-referencial.txt
- copied unchanged from r799462, lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/test-referencial.txt
Removed:
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/da.ngp
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/de.ngp
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/ee.ngp
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/el.ngp
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/en.ngp
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/es.ngp
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/fi.ngp
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/fr.ngp
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/hu.ngp
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/is.ngp
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/it.ngp
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/langmappings.properties
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/nl.ngp
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/no.ngp
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/pl.ngp
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/pt.ngp
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/ru.ngp
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/sv.ngp
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/th.ngp
lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/da.test
lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/de.test
lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/el.test
lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/en.test
lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/es.test
lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/fi.test
lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/fr.test
lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/it.test
lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/nl.test
lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/pt.test
lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/sv.test
lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/test-referencial.txt
Modified:
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java
lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/TestLanguageIdentifier.java
Modified: lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java?rev=799531&r1=799530&r2=799531&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java (original)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java Fri Jul 31 07:33:19 2009
@@ -98,7 +98,7 @@
String lang = (String) (alllanguages.nextElement());
InputStream is = this.getClass().getClassLoader().getResourceAsStream(
- "org/apache/nutch/analysis/lang/" + lang + "." + NGramProfile.FILE_EXTENSION);
+ "org/apache/tika/language/" + lang + "." + NGramProfile.FILE_EXTENSION);
if (is != null) {
NGramProfile profile = new NGramProfile(lang, minLength, maxLength);
@@ -137,6 +137,7 @@
// Create the suspect profile
suspect = new NGramProfile("suspect", minLength, maxLength);
} catch (Exception e) {
+ e.printStackTrace();
// if (LOG.isFatalEnabled()) { LOG.fatal(e.toString()); }
}
}
Modified: lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/TestLanguageIdentifier.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/TestLanguageIdentifier.java?rev=799531&r1=799530&r2=799531&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/TestLanguageIdentifier.java (original)
+++ lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/TestLanguageIdentifier.java Fri Jul 31 07:33:19 2009
@@ -180,8 +180,7 @@
}
}
- // Disable until the resource files are properly located
- public void disabledTtestIdentify() {
+ public void testIdentify() {
try {
long total = 0;
LanguageIdentifier idfr = new LanguageIdentifier();