You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by wk...@apache.org on 2012/08/02 14:11:52 UTC
svn commit: r1368432 - in
/incubator/stanbol/trunk/enhancer/engines/langdetect/src/main/java/org/apache/stanbol/enhancer/engines/langdetect:
LanguageDetectionEnhancementEngine.java LanguageIdentifier.java
Author: wkasper
Date: Thu Aug 2 12:11:51 2012
New Revision: 1368432
URL: http://svn.apache.org/viewvc?rev=1368432&view=rev
Log:
Add confidence value for best language hypothesis
Modified:
incubator/stanbol/trunk/enhancer/engines/langdetect/src/main/java/org/apache/stanbol/enhancer/engines/langdetect/LanguageDetectionEnhancementEngine.java
incubator/stanbol/trunk/enhancer/engines/langdetect/src/main/java/org/apache/stanbol/enhancer/engines/langdetect/LanguageIdentifier.java
Modified: incubator/stanbol/trunk/enhancer/engines/langdetect/src/main/java/org/apache/stanbol/enhancer/engines/langdetect/LanguageDetectionEnhancementEngine.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/langdetect/src/main/java/org/apache/stanbol/enhancer/engines/langdetect/LanguageDetectionEnhancementEngine.java?rev=1368432&r1=1368431&r2=1368432&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/langdetect/src/main/java/org/apache/stanbol/enhancer/engines/langdetect/LanguageDetectionEnhancementEngine.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/langdetect/src/main/java/org/apache/stanbol/enhancer/engines/langdetect/LanguageDetectionEnhancementEngine.java Thu Aug 2 12:11:51 2012
@@ -18,6 +18,7 @@ package org.apache.stanbol.enhancer.engi
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_LANGUAGE;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_TYPE;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_CONFIDENCE;
import static org.apache.stanbol.enhancer.servicesapi.rdf.TechnicalClasses.DCTERMS_LINGUISTIC_SYSTEM;
import java.io.IOException;
@@ -30,6 +31,7 @@ import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
+import org.apache.clerezza.rdf.core.LiteralFactory;
import org.apache.clerezza.rdf.core.MGraph;
import org.apache.clerezza.rdf.core.UriRef;
import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
@@ -55,6 +57,7 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.cybozu.labs.langdetect.LangDetectException;
+import com.cybozu.labs.langdetect.Language;
/**
* {@link LanguageDetectionEnhancementEngine} provides functionality to enhance document
@@ -111,6 +114,12 @@ public class LanguageDetectionEnhancemen
*/
private int probeLength = PROBE_LENGTH_DEFAULT;
+ /**
+ * The literal factory
+ */
+ private final LiteralFactory literalFactory = LiteralFactory.getInstance();
+
+
private LanguageIdentifier languageIdentifier;
/**
@@ -170,10 +179,10 @@ public class LanguageDetectionEnhancemen
if (checkLength > 0 && text.length() > checkLength) {
text = text.substring(text.length() / 2 - checkLength / 2, text.length() / 2 + checkLength / 2);
}
- String language = null;
+ List<Language> languages = null;
try {
- language = languageIdentifier.getLanguage(text);
- log.info("language identified as " + language);
+ languages = languageIdentifier.getLanguages(text);
+ log.info("language identified: {}",languages);
}
catch (LangDetectException e) {
log.warn("Could not identify language");
@@ -181,40 +190,20 @@ public class LanguageDetectionEnhancemen
}
// add language to metadata
- MGraph g = ci.getMetadata();
- ci.getLock().writeLock().lock();
- try {
- UriRef textEnhancement = EnhancementEngineHelper.createTextEnhancement(ci, this);
- g.add(new TripleImpl(textEnhancement, DC_LANGUAGE, new PlainLiteralImpl(language)));
- g.add(new TripleImpl(textEnhancement, DC_TYPE, DCTERMS_LINGUISTIC_SYSTEM));
- } finally {
- ci.getLock().writeLock().unlock();
- }
- }
-
- public List<String> loadProfiles(String folder, String configFile) throws Exception {
- List<String> profiles = new ArrayList<String>();
- java.util.Properties props = new java.util.Properties();
- props.load(getClass().getClassLoader().getResourceAsStream(configFile));
- String languages = props.getProperty("languages");
- if (languages == null) {
- throw new IOException("No languages defined");
- }
- for (String lang: languages.split(",")) {
- String profileFile = folder+"/"+lang;
- InputStream is = getClass().getClassLoader().getResourceAsStream(profileFile);
- String profile;
+ if (languages.size() > 0) {
+ MGraph g = ci.getMetadata();
+ ci.getLock().writeLock().lock();
+ // add best hypothesis
+ Language oneLang = languages.get(0);
try {
- profile = IOUtils.toString(is, "UTF-8");
- if (profile != null && profile.length() > 0) {
- profiles.add(profile);
- }
- is.close();
- } catch (IOException e) {
- e.printStackTrace();
+ UriRef textEnhancement = EnhancementEngineHelper.createTextEnhancement(ci, this);
+ g.add(new TripleImpl(textEnhancement, DC_LANGUAGE, new PlainLiteralImpl(oneLang.lang)));
+ g.add(new TripleImpl(textEnhancement, ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(oneLang.prob)));
+ g.add(new TripleImpl(textEnhancement, DC_TYPE, DCTERMS_LINGUISTIC_SYSTEM));
+ } finally {
+ ci.getLock().writeLock().unlock();
}
}
- return profiles;
}
public int getProbeLength() {
Modified: incubator/stanbol/trunk/enhancer/engines/langdetect/src/main/java/org/apache/stanbol/enhancer/engines/langdetect/LanguageIdentifier.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/langdetect/src/main/java/org/apache/stanbol/enhancer/engines/langdetect/LanguageIdentifier.java?rev=1368432&r1=1368431&r2=1368432&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/langdetect/src/main/java/org/apache/stanbol/enhancer/engines/langdetect/LanguageIdentifier.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/langdetect/src/main/java/org/apache/stanbol/enhancer/engines/langdetect/LanguageIdentifier.java Thu Aug 2 12:11:51 2012
@@ -26,6 +26,7 @@ import org.apache.commons.io.IOUtils;
import com.cybozu.labs.langdetect.Detector;
import com.cybozu.labs.langdetect.DetectorFactory;
import com.cybozu.labs.langdetect.LangDetectException;
+import com.cybozu.labs.langdetect.Language;
/**
* Standalone version of the Language Identifier
@@ -79,5 +80,11 @@ public class LanguageIdentifier {
detector.append(text);
return detector.detect();
}
+
+ public ArrayList<Language> getLanguages(String text) throws LangDetectException {
+ Detector detector = DetectorFactory.create();
+ detector.append(text);
+ return detector.getProbabilities();
+ }
}