You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by og...@apache.org on 2010/12/02 12:30:37 UTC
svn commit: r1041331 [1/2] - in /incubator/stanbol/trunk/iks-autotagging: ./
samples/ src/ src/main/ src/main/java/ src/main/java/eu/
src/main/java/eu/iks/ src/main/java/eu/iksproject/
src/main/java/eu/iksproject/autotagging/ src/main/java/eu/iksprojec...
Author: ogrisel
Date: Thu Dec 2 11:30:36 2010
New Revision: 1041331
URL: http://svn.apache.org/viewvc?rev=1041331&view=rev
Log:
temporary import of iks-autotagging that will eventually be replaced by an equivalent implementation in rick
Added:
incubator/stanbol/trunk/iks-autotagging/
incubator/stanbol/trunk/iks-autotagging/LICENSE.txt
incubator/stanbol/trunk/iks-autotagging/README.txt
incubator/stanbol/trunk/iks-autotagging/pom.xml
incubator/stanbol/trunk/iks-autotagging/samples/
incubator/stanbol/trunk/iks-autotagging/samples/bob_marley.txt
incubator/stanbol/trunk/iks-autotagging/samples/jimi_hendrix.txt
incubator/stanbol/trunk/iks-autotagging/samples/russia_timezones.txt
incubator/stanbol/trunk/iks-autotagging/src/
incubator/stanbol/trunk/iks-autotagging/src/main/
incubator/stanbol/trunk/iks-autotagging/src/main/java/
incubator/stanbol/trunk/iks-autotagging/src/main/java/eu/
incubator/stanbol/trunk/iks-autotagging/src/main/java/eu/iks/
incubator/stanbol/trunk/iks-autotagging/src/main/java/eu/iksproject/
incubator/stanbol/trunk/iks-autotagging/src/main/java/eu/iksproject/autotagging/
incubator/stanbol/trunk/iks-autotagging/src/main/java/eu/iksproject/autotagging/Autotagger.java
incubator/stanbol/trunk/iks-autotagging/src/main/java/eu/iksproject/autotagging/TagInfo.java
incubator/stanbol/trunk/iks-autotagging/src/main/java/eu/iksproject/autotagging/cli/
incubator/stanbol/trunk/iks-autotagging/src/main/java/eu/iksproject/autotagging/cli/CommandLineRunner.java
incubator/stanbol/trunk/iks-autotagging/src/main/java/eu/iksproject/autotagging/jena/
incubator/stanbol/trunk/iks-autotagging/src/main/java/eu/iksproject/autotagging/jena/ModelIndexer.java
incubator/stanbol/trunk/iks-autotagging/src/main/java/eu/iksproject/autotagging/jena/ModelResampler.java
incubator/stanbol/trunk/iks-autotagging/src/main/java/eu/iksproject/autotagging/jena/ResourceInfo.java
incubator/stanbol/trunk/iks-autotagging/src/main/resources/
incubator/stanbol/trunk/iks-autotagging/src/main/resources/META-INF/
incubator/stanbol/trunk/iks-autotagging/src/main/resources/META-INF/MANIFEST.MF
incubator/stanbol/trunk/iks-autotagging/src/test/
incubator/stanbol/trunk/iks-autotagging/src/test/java/
incubator/stanbol/trunk/iks-autotagging/src/test/java/eu/
incubator/stanbol/trunk/iks-autotagging/src/test/java/eu/iksproject/
incubator/stanbol/trunk/iks-autotagging/src/test/java/eu/iksproject/autotagging/
incubator/stanbol/trunk/iks-autotagging/src/test/java/eu/iksproject/autotagging/AutotaggingTest.java
incubator/stanbol/trunk/iks-autotagging/src/test/java/eu/iksproject/autotagging/ModelResamplerTest.java
incubator/stanbol/trunk/iks-autotagging/src/test/resources/
incubator/stanbol/trunk/iks-autotagging/src/test/resources/dbpedia_3.4_instancetype_en.nt
incubator/stanbol/trunk/iks-autotagging/src/test/resources/dbpedia_3.4_longabstract_en.nt
Added: incubator/stanbol/trunk/iks-autotagging/LICENSE.txt
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/iks-autotagging/LICENSE.txt?rev=1041331&view=auto
==============================================================================
--- incubator/stanbol/trunk/iks-autotagging/LICENSE.txt (added)
+++ incubator/stanbol/trunk/iks-autotagging/LICENSE.txt Thu Dec 2 11:30:36 2010
@@ -0,0 +1,25 @@
+Copyright 2010 IKS Consortium. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are
+permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice, this list of
+ conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice, this list
+ of conditions and the following disclaimer in the documentation and/or other materials
+ provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY IKS CONSORTIUM ``AS IS'' AND ANY EXPRESS OR IMPLIED
+WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL IKS CONSORTIUM OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+The views and conclusions contained in the software and documentation are those of the
+authors and should not be interpreted as representing official policies, either expressed
+or implied, of IKS Consortium.
Added: incubator/stanbol/trunk/iks-autotagging/README.txt
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/iks-autotagging/README.txt?rev=1041331&view=auto
==============================================================================
--- incubator/stanbol/trunk/iks-autotagging/README.txt (added)
+++ incubator/stanbol/trunk/iks-autotagging/README.txt Thu Dec 2 11:30:36 2010
@@ -0,0 +1,184 @@
+IKS Autotagging
+===============
+
+:author: ogrisel@nuxeo.com
+
+Text document classification / topic assignment service based on the text
+content of DBpedia. The implementation is based on lucene and the
+`MoreLikeThis` similarity query that leverages term frequencies of the index.
+
+
+Building
+========
+
+1- Download maven_ and ensure that the `mvn` command is registered in your
+`PATH` environment variable.
+
+2- From the top of the `iks-autotagging/` folder build using maven as usual (this
+ will install the `iks-autotagging-X.X.X-SNAPSHOT.jar` jar in your local maven
+ repository and make it available to dependent projects such as FISE)::
+
+ % mvn install
+
+3- From the same folder, build a standalone jar suitable for commandline
+ usage. The resulting jar will be named:
+ `target/iks-autotagging-X.X.X-SNAPSHOT-jar-with-dependencies.jar`::
+
+ % mvn assembly:assembly
+
+4- (Optional) To import the project in eclipse, first run::
+
+ % mvn eclipse:eclipse
+
+This will generate `.project` and `.classpath` files so that you can
+"Import > Import existing projects into workspace" from the Eclipse
+UI. Alternatively you can install the `m2eclipse` plugin to directly
+import maven projects into eclipse.
+
+.. _maven: http://maven.apache.org
+
+
+Command line usage
+==================
+
+You can use the autotagger with a default embedded lucene index of
+the top 10000 entities of DBpedia::
+
+ % java -jar target/iks-autotagging-*-SNAPSHOT-jar-with-dependencies.jar \
+ suggest -f samples/bob_marley.txt
+ [...]
+ Annotating 'samples/bob_marley.txt'... done in 739ms:
+ Suggestion #1 (score: 4.648216): 'Bob Marley'
+ URI: http://dbpedia.org/resource/Bob_Marley
+ type: http://www.w3.org/2002/07/owl#Thing
+ type: http://dbpedia.org/ontology/Person
+ type: http://dbpedia.org/ontology/Artist
+ type: http://dbpedia.org/ontology/MusicalArtist
+ Suggestion #2 (score: 0.127039): 'Bunny Wailer'
+ URI: http://dbpedia.org/resource/Bunny_Wailer
+ type: http://www.w3.org/2002/07/owl#Thing
+ type: http://dbpedia.org/ontology/Person
+ type: http://dbpedia.org/ontology/Artist
+ type: http://dbpedia.org/ontology/MusicalArtist
+ Suggestion #3 (score: 0.121009): 'Desmond Dekker'
+ URI: http://dbpedia.org/resource/Desmond_Dekker
+ type: http://www.w3.org/2002/07/owl#Thing
+ type: http://dbpedia.org/ontology/Person
+ type: http://dbpedia.org/ontology/Artist
+ type: http://dbpedia.org/ontology/MusicalArtist
+
+For better recall performance it is strongly recommended
+to use a more comprehensive index of DBpedia entities.
+
+To do so you first need to build or download a dedicated
+DBpedia lucene index in a folder named `/path/to/lucene-idx` (for instance)
+on the local filesystem (see later sections for instructions). You can download
+a prebuilt index from here:
+
+ http://dl.dropbox.com/u/5743203/IKS/autotagging/iks-dbpedia-lucene-idx-20100331-0.tar.bz2
+
+(A better index is currently under construction...)
+
+You can then add the "-i /path/to/lucene-idx" option to the
+previous command line to use your custom index.
+
+Instructions for building your own index from scratch are available in the following
+sections.
+
+
+Restful API
+===========
+
+:TODO: implement me first!
+
+Launch a HTTP server to provide the service using a RESTful API thanks to jetty
+and Jersey::
+
+ % mvn jetty:run
+ % curl -T file-to-annotate.txt http://localhost:8080/autotagging
+
+RDF/JSON annotations could be serialized using this convention: http://jdil.org/.
+
+Also the FISE project features an OSGi embedding of this library combined with
+RESTful interface and persistent annotation and content stores::
+
+ http://code.google.com/p/iks-project/source/browse/sandbox/fise/trunk/
+
+
+Building a lucene index from DBpedia dumps
+==========================================
+
+1- Download and uncompress (using `bzip2 -d <filename>`) the following datasets from DBpedia:
+
+ - instancetype_en_nt.bz2_
+
+ - longabstract_en_nt.bz2_
+
+ - article_label_en_nt.bz2_
+
+.. _instancetype_en_nt.bz2: http://downloads.dbpedia.org/3.4/en/instancetype_en.nt.bz2
+.. _longabstract_en_nt.bz2: http://downloads.dbpedia.org/3.4/en/longabstract_en.nt.bz2
+.. _article_label_en_nt.bz2: http://downloads.dbpedia.org/3.4/en/article_label.nt.bz2
+
+
+2- Build a temporary Jena TDB store::
+
+ % java -Xmx2g -server -jar target/iks-autotagging-*-SNAPSHOT-jar-with-dependencies.jar \
+ model /path/to/dbpedia-tdb /path/to/instancetype_en.nt /path/to/longabstract_en.nt /path/to/articles_label_en.nt
+
+Alternatively you can download and use the `bin/tdbloader` tool from the TDB
+distribution.
+
+3- Index the Jena TDB into a Lucene `FSDirectory`::
+
+ % java -Xmx2g -server -jar target/iks-autotagging-*-SNAPSHOT-jar-with-dependencies.jar \
+ index /path/to/dbpedia-tdb /path/to/lucene-idx
+
+You can then use luke_ to check the content of the resulting index::
+
+ % java -jar /path/to/lukeall-1.0.0.jar -index /path/to/lucene-idx
+
+.. _luke: http://www.getopt.org/luke/
+
+
+Recently implemented
+====================
+
+0- Finish implementing the `JenaIndexer#main` method to be able to create a Jena
+ TDB store out of DBpedia dumps from the command line.
+
+1- Use the lucene `ShingleFilter` to generate bi-grams (or tri-grams) of token
+ and improve the accuracy of the results at the expense of the size of the
+ index.
+
+2- Extend the `Autotagger` API to allow the requester to ask for a
+ specific entity type (useful to combine with the output of a Named Entity
+ detection module).
+
+4- Extend the `TagInfo` class to feedback the caller with confidence levels
+ for each suggestion (from lucene scores).
+
+5- Improve the `JenaIndexer` to index other text literal sources such as labels,
+ comments, ...
+
+6- Build a small index of the most popular people / place / organization from
+ DBpedia to be packaged easily as the default IKS model
+
+
+Roadmap
+=======
+
+3- Implement standalone jersey-based JAX-RS components that take the text
+ content as an input and output suggested annotations as RDF/XML or RDF/JSON
+ synchronously.
+
+7- Index the DBpedia categories dumps by aggregating literal text from
+ directly related entities, and propose suggestions with type topic to
+ complement entity-typed tags.
+
+8- Use a complete wikimedia markup dump as a fulltext source for the index
+ instead of just DBpedia
+
+9- Index the textual context (enclosing paragraph) of all incoming links
+ to an entity coming from other wikipedia articles.
+
Added: incubator/stanbol/trunk/iks-autotagging/pom.xml
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/iks-autotagging/pom.xml?rev=1041331&view=auto
==============================================================================
--- incubator/stanbol/trunk/iks-autotagging/pom.xml (added)
+++ incubator/stanbol/trunk/iks-autotagging/pom.xml Thu Dec 2 11:30:36 2010
@@ -0,0 +1,149 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project>
+ <modelVersion>4.0.0</modelVersion>
+ <groupId>eu.iksproject</groupId>
+ <artifactId>iks-autotagging</artifactId>
+ <version>0.1.0-SNAPSHOT</version>
+ <name>iks-autotagging</name>
+ <description>Service to assign DBpedia-based resources to unstructured
+ text content as related entities (Person, Place, Organization) or
+ topics (a.k.a. Wikipedia categories).</description>
+ <repositories>
+ <repository>
+ <id>central</id>
+ <url>http://repo1.maven.org/maven2</url>
+ </repository>
+ <repository>
+ <!--
+ needed for the default model data while we decide where to put IKS
+ artifacts
+ -->
+ <id>nuxeo-vendor-release</id>
+ <url>https://maven.nuxeo.org/nexus/content/repositories/vendor-releases</url>
+ </repository>
+ </repositories>
+ <dependencies>
+ <dependency>
+ <groupId>eu.iksproject</groupId>
+ <artifactId>iks-autotagging-data</artifactId>
+ <version>0.1.2</version>
+ </dependency>
+ <dependency>
+ <groupId>commons-cli</groupId>
+ <artifactId>commons-cli</artifactId>
+ <version>1.2</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.commons</groupId>
+ <artifactId>commons-compress</artifactId>
+ <version>1.0</version>
+ </dependency>
+ <dependency>
+ <groupId>commons-lang</groupId>
+ <artifactId>commons-lang</artifactId>
+ <version>2.4</version>
+ </dependency>
+ <dependency>
+ <groupId>commons-io</groupId>
+ <artifactId>commons-io</artifactId>
+ <version>1.4</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.lucene</groupId>
+ <artifactId>lucene-core</artifactId>
+ <version>3.0.1</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.lucene</groupId>
+ <artifactId>lucene-queries</artifactId>
+ <version>3.0.1</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.lucene</groupId>
+ <artifactId>lucene-analyzers</artifactId>
+ <version>3.0.1</version>
+ </dependency>
+ <dependency>
+ <groupId>com.hp.hpl.jena</groupId>
+ <artifactId>jena</artifactId>
+ <version>2.6.2</version>
+ <exclusions>
+ <exclusion>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-log4j12</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+ <dependency>
+ <groupId>com.hp.hpl.jena</groupId>
+ <artifactId>arq</artifactId>
+ <version>2.8.2</version>
+ <exclusions>
+ <exclusion>
+ <groupId>com.sun.jmx</groupId>
+ <artifactId>jmxri</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>javax.jms</groupId>
+ <artifactId>jms</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>com.sun.jdmk</groupId>
+ <artifactId>jmxtools</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-log4j12</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+ <dependency>
+ <groupId>com.hp.hpl.jena</groupId>
+ <artifactId>tdb</artifactId>
+ <version>0.8.4</version>
+ </dependency>
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-api</artifactId>
+ <version>1.5.8</version>
+ </dependency>
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-simple</artifactId>
+ <version>1.5.8</version>
+ </dependency>
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <version>4.7</version>
+ <scope>test</scope>
+ </dependency>
+ </dependencies>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-compiler-plugin</artifactId>
+ <configuration>
+ <source>1.5</source>
+ <target>1.5</target>
+ </configuration>
+ </plugin>
+ <plugin>
+ <artifactId>maven-assembly-plugin</artifactId>
+ <version>2.2-beta-5</version>
+ <configuration>
+ <descriptorRefs>
+ <descriptorRef>jar-with-dependencies</descriptorRef>
+ </descriptorRefs>
+ <archive>
+ <manifest>
+ <mainClass>eu.iksproject.autotagging.cli.CommandLineRunner</mainClass>
+ </manifest>
+ </archive>
+ </configuration>
+ </plugin>
+ </plugins>
+ </build>
+</project>
Added: incubator/stanbol/trunk/iks-autotagging/samples/bob_marley.txt
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/iks-autotagging/samples/bob_marley.txt?rev=1041331&view=auto
==============================================================================
--- incubator/stanbol/trunk/iks-autotagging/samples/bob_marley.txt (added)
+++ incubator/stanbol/trunk/iks-autotagging/samples/bob_marley.txt Thu Dec 2 11:30:36 2010
@@ -0,0 +1,15 @@
+Robert Nesta "Bob" Marley (February 6, 1945 – May 11, 1981) was
+a Jamaican singer-songwriter and musician. He was the lead singer,
+songwriter and guitarist for the ska, rocksteady and reggae bands The
+Wailers (1964–1974) and Bob Marley & The Wailers (1974–1981). Marley
+remains the most widely known and revered performer of reggae music,
+and is credited for helping spread both Jamaican music and the Rastafari
+movement to a worldwide audience.
+
+Marley's best known hits include "I Shot the Sheriff", "No Woman, No
+Cry", "Could You Be Loved", "Stir It Up", "Jamming", "Redemption Song",
+"One Love" and, together with The Wailers, "Three Little Birds", as well
+as the posthumous releases "Buffalo Soldier" and "Iron Lion Zion". The
+compilation album, Legend (1984), released three years after his death,
+is reggae's best-selling album, being 10 times Platinum (Diamond) in
+the U.S., and selling 20 million copies worldwide.
Added: incubator/stanbol/trunk/iks-autotagging/samples/jimi_hendrix.txt
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/iks-autotagging/samples/jimi_hendrix.txt?rev=1041331&view=auto
==============================================================================
--- incubator/stanbol/trunk/iks-autotagging/samples/jimi_hendrix.txt (added)
+++ incubator/stanbol/trunk/iks-autotagging/samples/jimi_hendrix.txt Thu Dec 2 11:30:36 2010
@@ -0,0 +1,36 @@
+James Marshall "Jimi" Hendrix (born Johnny Allen Hendrix; November
+27, 1942 – September 18, 1970) was an American guitarist, singer and
+songwriter. He is often considered to be the greatest electric guitarist
+in the history of rock music by other musicians and commentators in
+the industry, and one of the most important and influential musicians
+of his era across a range of genres. After initial success in Europe,
+he achieved fame in the United States following his 1967 performance
+at the Monterey Pop Festival. Later, Hendrix headlined the iconic
+1969 Woodstock Festival and the 1970 Isle of Wight Festival. Hendrix
+often favored raw overdriven amplifiers with high gain and treble and
+helped develop the previously undesirable technique of guitar amplifier
+feedback. Hendrix was one of the musicians who popularized the wah-wah
+pedal in mainstream rock which he often used to deliver an exaggerated
+pitch in his solos, particularly with high bends and use of legato based
+around the pentatonic scale. He was influenced by blues artists such as
+B.B. King, Muddy Waters, Howlin' Wolf, Albert King, and Elmore James,
+rhythm and blues and soul guitarists Curtis Mayfield, Steve Cropper, as
+well as by some modern jazz. In 1966, Hendrix, who played and recorded
+with Little Richard's band from 1964 to 1965, said, "I want to do with
+my guitar what Little Richard does with his voice."
+
+As a record producer, Hendrix also broke new ground in using the recording
+studio as an extension of his musical ideas. He was one of the first to
+experiment with stereophonic and phasing effects for rock recording.
+
+Hendrix won many of the most prestigious rock music awards in his
+lifetime, and has been posthumously awarded many more, including being
+inducted into the US Rock and Roll Hall of Fame in 1992 and the UK Music
+Hall of Fame in 2005. An English Heritage blue plaque was erected in
+his name on his former residence at Brook Street, London, in September
+1997. A star on the Hollywood Walk of Fame (at 6627 Hollywood Blvd.) was
+dedicated in 1994. In 2006, his debut US album, Are You Experienced,
+was inducted into the United States National Recording Registry, and
+Rolling Stone named Hendrix the top guitarist on its list of the 100
+greatest guitarists of all-time in 2003. He was also the first person
+inducted into the Native American Music Hall of Fame.
Added: incubator/stanbol/trunk/iks-autotagging/samples/russia_timezones.txt
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/iks-autotagging/samples/russia_timezones.txt?rev=1041331&view=auto
==============================================================================
--- incubator/stanbol/trunk/iks-autotagging/samples/russia_timezones.txt (added)
+++ incubator/stanbol/trunk/iks-autotagging/samples/russia_timezones.txt Thu Dec 2 11:30:36 2010
@@ -0,0 +1,25 @@
+The Russian government has decided to remove two of its eleven timezones,
+in the country's first step towards time reform, first started by
+president Dmitriy Medvedev in last November.
+
+The affected regions were Chukotka, the easternmost province of Russia
+were moved back an hour, as were and Samara and Udmurtia, which are now
+on Moscow time. The changes were implemented on Saturday night, when
+most of the country was due to put their country ahead for summer time;
+however, affected areas instead didn't change their clocks at all.
+
+"It's possible that this could also aid the strengthening of Russia's
+position as a link in the global information infrastructure," Medvedev
+remarked earlier this month. "Reducing of amount of time zones is very
+efficient for managing, for accordance of actions, for approximation of
+far regions to the center," commented Arkady Tishkov, who is a deputy
+science director of Geography Institution for the Russian Academy
+of Sciences. Tishkov speculated that the number of time zones could
+eventually be reduced to six. Meanwhile, an online petition has been
+posted opposing the time change for the Samara province, and it has
+garnered close to 13,000 signatures. "Trips take place to many regions
+of the country and world where time, you understand, far from always
+corresponds with Moscow," the text of the petition read, adding that
+"In the winter, darkness will come almost at lunchtime, which isn't
+convenient and is psychologically quite hard."
+
Added: incubator/stanbol/trunk/iks-autotagging/src/main/java/eu/iksproject/autotagging/Autotagger.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/iks-autotagging/src/main/java/eu/iksproject/autotagging/Autotagger.java?rev=1041331&view=auto
==============================================================================
--- incubator/stanbol/trunk/iks-autotagging/src/main/java/eu/iksproject/autotagging/Autotagger.java (added)
+++ incubator/stanbol/trunk/iks-autotagging/src/main/java/eu/iksproject/autotagging/Autotagger.java Thu Dec 2 11:30:36 2010
@@ -0,0 +1,328 @@
+package eu.iksproject.autotagging;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.net.URLDecoder;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.shingle.ShingleAnalyzerWrapper;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.FuzzyQuery;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.search.similar.MoreLikeThis;
+import org.apache.lucene.search.similar.MoreLikeThisQuery;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.Version;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import eu.iksproject.autotagging.jena.ModelIndexer;
+
+/**
+ * Engine that uses a Lucene index of DBpedia entities (types and abstracts) to
+ * suggest the entities that best match the text content to annotate. The
+ * number of returned suggestions defaults to 3 and can be changed with
+ * {@link #withMaxSuggestions(int)}.
+ *
+ * @author ogrisel
+ */
+public class Autotagger {
+
+    /** URI prefix stripped from entity ids to derive a human readable label. */
+    private static final String DBPEDIA_RESOURCE_PREFIX = "http://dbpedia.org/resource/";
+
+    /**
+     * Texts with at most this many analyzed tokens are handled as a name
+     * lookup instead of a context similarity search.
+     */
+    private static final int NAME_LOOKUP_MAX_TOKENS = 3;
+
+    private final Logger log = LoggerFactory.getLogger(getClass());
+
+    public String typeFieldName = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type";
+
+    private final String lookupFieldName = "http://www.w3.org/2000/01/rdf-schema#label";
+
+    private String[] likeFieldNames = {
+            "http://www.w3.org/2000/01/rdf-schema#label",
+            "http://dbpedia.org/property/abstract" };
+
+    private String idField = ModelIndexer.URI_FIELD;
+
+    private int maxSuggestions = 3;
+
+    private float lookupBoost = 2f;
+
+    private float contextBoost = 1f;
+
+    private Analyzer analyzer = getDefaultAnalyzer();
+
+    private String typePrefix = "http://dbpedia.org/ontology/";
+
+    private boolean strictLookup = true;
+
+    private final Directory directory;
+
+    /**
+     * @param directory the Lucene directory holding the entity index; a
+     *            read-only reader is opened on it for each request
+     */
+    public Autotagger(Directory directory) {
+        this.directory = directory;
+    }
+
+    /**
+     * @return a new {@link StandardAnalyzer} configured for Lucene 3.0
+     */
+    public static Analyzer getDefaultAnalyzer() {
+        return new StandardAnalyzer(Version.LUCENE_30);
+    }
+
+    /**
+     * @param withShingles true to wrap the configured analyzer in a
+     *            {@link ShingleAnalyzerWrapper}
+     * @return the configured analyzer, optionally wrapped to emit shingles
+     */
+    public Analyzer getAnalyzer(boolean withShingles) {
+        if (withShingles) {
+            return new ShingleAnalyzerWrapper(analyzer);
+        } else {
+            return analyzer;
+        }
+    }
+
+    /** Sets the fields used for the similarity (context) search. */
+    public Autotagger withFieldNames(String[] fieldNames) {
+        this.likeFieldNames = fieldNames;
+        return this;
+    }
+
+    /** Sets the stored field from which the entity id is read. */
+    public Autotagger withIdFieldName(String idField) {
+        this.idField = idField;
+        return this;
+    }
+
+    /** Sets the maximum number of suggestions returned by a request. */
+    public Autotagger withMaxSuggestions(int maxSuggestions) {
+        this.maxSuggestions = maxSuggestions;
+        return this;
+    }
+
+    /** Sets the analyzer used to tokenize names and contexts. */
+    public Autotagger withAnalyzer(Analyzer analyzer) {
+        this.analyzer = analyzer;
+        return this;
+    }
+
+    /** Sets the boost applied to each fuzzy name lookup clause. */
+    public Autotagger withLookupBoost(float lookupBoost) {
+        this.lookupBoost = lookupBoost;
+        return this;
+    }
+
+    /** Sets the boost applied to the context similarity clause. */
+    public Autotagger withContextBoost(float contextBoost) {
+        this.contextBoost = contextBoost;
+        return this;
+    }
+
+    /** Sets the prefix prepended to type names that are not full URIs. */
+    public Autotagger withTypePrefix(String typePrefix) {
+        this.typePrefix = typePrefix;
+        return this;
+    }
+
+    /**
+     * Chooses whether every token of a looked-up name must match (true) or
+     * whether name tokens are optional clauses (false).
+     */
+    public Autotagger withStrictNameLookup(boolean strictLookup) {
+        this.strictLookup = strictLookup;
+        return this;
+    }
+
+    /**
+     * Suggest entities that are textually similar to the given text.
+     *
+     * @param text the text to annotate
+     * @return entities info that best match the text
+     * @throws CorruptIndexException
+     * @throws IOException
+     */
+    public List<TagInfo> suggest(String text) throws CorruptIndexException,
+            IOException {
+        return suggest(text, null);
+    }
+
+    /**
+     * Suggest entities that are textually similar to the given text. If the
+     * text is short enough (at most 3 analyzed tokens), a fuzzy name lookup is
+     * performed instead. Further restrict the results to match the field
+     * values given in the fieldFilters.
+     *
+     * @param text the textual content used for similarity search
+     * @param fieldFilters field name to required values, or null for no filter
+     * @return entities info that best match the text
+     * @throws IllegalArgumentException if text is null or empty
+     * @throws CorruptIndexException
+     * @throws IOException
+     */
+    public List<TagInfo> suggest(String text,
+            Map<String, List<String>> fieldFilters)
+            throws CorruptIndexException, IOException {
+        if (text == null) {
+            // fail the same way as the empty-text case instead of with an NPE
+            throw new IllegalArgumentException("text cannot be null");
+        }
+        if (countTokens(text) > NAME_LOOKUP_MAX_TOKENS) {
+            // this is a context based suggestion
+            return suggest(null, text, fieldFilters);
+        } else {
+            // this is a name lookup
+            return suggest(text, null, fieldFilters);
+        }
+    }
+
+    /**
+     * Suggest entities that are fuzzy matching the given name and/or textually
+     * similar to the given context. Further restrict the results to match the
+     * field values given in the fieldFilters.
+     *
+     * @param name the entity name used for the fuzzy lookup (may be null)
+     * @param context the textual content used for similarity search (may be
+     *            null)
+     * @param fieldFilters field name to required values, or null for no filter
+     * @return entities info that best match, skipping hits with a zero score
+     * @throws IllegalArgumentException if both name and context are null or
+     *             empty
+     * @throws CorruptIndexException
+     * @throws IOException
+     */
+    public List<TagInfo> suggest(String name, String context,
+            Map<String, List<String>> fieldFilters)
+            throws CorruptIndexException, IOException {
+
+        if ((name == null || name.length() == 0)
+                && (context == null || context.length() == 0)) {
+            throw new IllegalArgumentException(
+                    "name and context value cannot be both null or empty");
+        }
+
+        List<TagInfo> suggestions = new ArrayList<TagInfo>(maxSuggestions);
+        IndexReader reader = IndexReader.open(directory, true);
+        try {
+            IndexSearcher searcher = new IndexSearcher(reader);
+            try {
+                BooleanQuery query = buildQuery(name, context, fieldFilters);
+                TopDocs hits = searcher.search(query, maxSuggestions);
+                for (ScoreDoc scoreDoc : hits.scoreDocs) {
+                    double confidence = scoreDoc.score;
+                    if (confidence == 0.0) {
+                        // this might happen with BooleanClause.Occur.SHOULD
+                        // queries
+                        continue;
+                    }
+                    Document d = searcher.doc(scoreDoc.doc);
+                    String id = d.get(idField);
+                    if (id == null) {
+                        // no stored id: cannot build a TagInfo for this hit
+                        log.warn("skipping document {} without stored field '{}'",
+                                scoreDoc.doc, idField);
+                        continue;
+                    }
+                    if (log.isDebugEnabled()) {
+                        log.debug(String.format(
+                                "entity '%s' matches with score %f", id,
+                                confidence));
+                    }
+                    TagInfo tag = new TagInfo(id, labelFromId(id),
+                            d.getValues(typeFieldName), confidence);
+                    suggestions.add(tag);
+                }
+            } finally {
+                searcher.close();
+            }
+        } finally {
+            // always release the reader, even if the searcher failed
+            reader.close();
+        }
+        return suggestions;
+    }
+
+    /**
+     * Suggest entities that are fuzzy matching the given text (if short) or
+     * textually similar to the text (if long). Further restrict the results to
+     * match the type given either as full URI or DBpedia class name.
+     *
+     * @param text the textual content to annotate
+     * @param type full type URI or DBpedia class name; null for no type filter
+     * @return ranked entities info that best match
+     * @throws CorruptIndexException
+     * @throws IOException
+     */
+    public List<TagInfo> suggestForType(String text, String type)
+            throws CorruptIndexException, IOException {
+        return suggest(text, typeFilter(type));
+    }
+
+    /**
+     * Suggest entities that are fuzzy matching the given name and/or textually
+     * similar to the given context. Further restrict the results to match the
+     * type given either as full URI or DBpedia class name.
+     *
+     * @param name the entity name used for the fuzzy lookup (may be null)
+     * @param context the textual content used for similarity search (may be
+     *            null)
+     * @param type full type URI or DBpedia class name; null for no type filter
+     * @return ranked entities info that best match
+     * @throws CorruptIndexException
+     * @throws IOException
+     */
+    public List<TagInfo> suggestForType(String name, String context, String type)
+            throws CorruptIndexException, IOException {
+        return suggest(name, context, typeFilter(type));
+    }
+
+    /**
+     * Extract the terms of the text that Lucene's {@link MoreLikeThis}
+     * considers most interesting with regard to the index.
+     *
+     * @param text the text to analyze
+     * @return at most maxSuggestions terms
+     * @throws CorruptIndexException
+     * @throws IOException
+     */
+    public String[] mostImportantTerms(String text) throws CorruptIndexException, IOException {
+        IndexReader reader = IndexReader.open(directory, true);
+        try {
+            MoreLikeThis mlt = new MoreLikeThis(reader);
+            mlt.setFieldNames(likeFieldNames);
+            mlt.setAnalyzer(analyzer);
+            mlt.setMaxQueryTerms(maxSuggestions);
+            return mlt.retrieveInterestingTerms(new StringReader(text));
+        } finally {
+            // the reader was previously leaked on every call
+            reader.close();
+        }
+    }
+
+    /** Counts the tokens the configured analyzer produces for the text. */
+    private int countTokens(String text) throws IOException {
+        TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(text));
+        try {
+            int tokens = 0;
+            while (tokenStream.incrementToken()) {
+                tokens++;
+            }
+            return tokens;
+        } finally {
+            tokenStream.close();
+        }
+    }
+
+    /**
+     * Assembles the boolean query: fuzzy name clauses, a context similarity
+     * clause and zero-boost exact-match filter clauses.
+     */
+    private BooleanQuery buildQuery(String name, String context,
+            Map<String, List<String>> fieldFilters) throws IOException {
+        BooleanQuery query = new BooleanQuery();
+
+        // fuzzy name lookup
+        if (name != null) {
+            TokenStream ts = analyzer.tokenStream(lookupFieldName,
+                    new StringReader(name));
+            try {
+                TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
+                BooleanClause.Occur occur = strictLookup ? BooleanClause.Occur.MUST
+                        : BooleanClause.Occur.SHOULD;
+                while (ts.incrementToken()) {
+                    FuzzyQuery fuzzyQuery = new FuzzyQuery(new Term(
+                            lookupFieldName, termAtt.term()), 0.8f);
+                    // TODO: divide boost by number of terms
+                    fuzzyQuery.setBoost(lookupBoost);
+                    query.add(fuzzyQuery, occur);
+                }
+            } finally {
+                ts.close();
+            }
+        }
+
+        // similarity context search
+        if (context != null) {
+            // TODO: use FuzzyLikeThisQuery instead?
+            // TODO: re-enable shingles once we can get rid of the "-"
+            // shingles
+            MoreLikeThisQuery mltQuery = new MoreLikeThisQuery(context,
+                    likeFieldNames, getAnalyzer(false));
+            mltQuery.setPercentTermsToMatch(0.15f);
+            mltQuery.setMaxQueryTerms(20);
+            mltQuery.setMinTermFrequency(1);
+            mltQuery.setMinDocFreq(1);
+            mltQuery.setBoost(contextBoost);
+            query.add(mltQuery, BooleanClause.Occur.SHOULD);
+        }
+
+        // additional exact match filters
+        if (fieldFilters != null) {
+            for (Map.Entry<String, List<String>> fieldFilter : fieldFilters.entrySet()) {
+                for (String value : fieldFilter.getValue()) {
+                    TermQuery tq = new TermQuery(new Term(
+                            fieldFilter.getKey(), value));
+                    // should not influence ranking, just filtering
+                    tq.setBoost(0.0f);
+                    query.add(tq, BooleanClause.Occur.MUST);
+                }
+            }
+        }
+        return query;
+    }
+
+    /**
+     * Derives a label from the entity URI to avoid loading the lucene index
+     * with a stored label field. The DBpedia resource prefix is stripped only
+     * when present, so ids from other namespaces no longer fail or get
+     * truncated arbitrarily.
+     */
+    private static String labelFromId(String id) throws IOException {
+        String label = URLDecoder.decode(id, "UTF-8");
+        if (label.startsWith(DBPEDIA_RESOURCE_PREFIX)) {
+            label = label.substring(DBPEDIA_RESOURCE_PREFIX.length());
+        }
+        return label.replace("_", " ");
+    }
+
+    /**
+     * Builds the field filter for a type restriction; returns an empty filter
+     * when type is null. Names that do not start with "http://" are resolved
+     * against the configured type prefix.
+     */
+    private Map<String, List<String>> typeFilter(String type) {
+        Map<String, List<String>> fieldFilters = new HashMap<String, List<String>>();
+        if (type != null) {
+            if (!type.startsWith("http://")) {
+                type = typePrefix + type;
+            }
+            fieldFilters.put(typeFieldName, Arrays.asList(type));
+        }
+        return fieldFilters;
+    }
+}
\ No newline at end of file
Added: incubator/stanbol/trunk/iks-autotagging/src/main/java/eu/iksproject/autotagging/TagInfo.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/iks-autotagging/src/main/java/eu/iksproject/autotagging/TagInfo.java?rev=1041331&view=auto
==============================================================================
--- incubator/stanbol/trunk/iks-autotagging/src/main/java/eu/iksproject/autotagging/TagInfo.java (added)
+++ incubator/stanbol/trunk/iks-autotagging/src/main/java/eu/iksproject/autotagging/TagInfo.java Thu Dec 2 11:30:36 2010
@@ -0,0 +1,81 @@
+package eu.iksproject.autotagging;
+
+
+/**
+ * Simple data transfer object to hold the results of the Autotagger annotation
+ * process. This then can be mapped to a very simple RDF graph to publish the
+ * results annotations to third party applications.
+ *
+ * @author ogrisel
+ */
+public class TagInfo {
+
+ /**
+ * Unique ID of the entity that is related to the text content. This is
+ * typically the DBpedia unique URI of the entity.
+ */
+ private final String id;
+
+ /**
+ * Human readable label (or name) of the related entity.
+ */
+ private final String label;
+
+ /**
+ * Measure of the estimated quality of the suggestion, the bigger, the
+ * better. The actual range of values is data and implementation specific.
+ */
+ private final Double confidence;
+
+ /**
+ * List of types of the related entity. This typically a list of owl:Class
+ * from the DBpedia ontology (e.g. 'http://dbpedia.org/ontology/Person').
+ */
+ private final String[] type;
+
+ public TagInfo(String id, String label, String[] type, double confidence) {
+ if(id == null){
+ throw new IllegalArgumentException("Parameter id MUST NOT be NULL");
+ }
+ this.id = id;
+ this.label = label;
+ this.type = type;
+ this.confidence = confidence;
+ }
+
+ @Override
+ public String toString() {
+ return String.format("%s [%f]", label, confidence);
+ }
+ /**
+ * Checks for != null, instanceof TagInfor and equals id
+ */
+ @Override
+ public boolean equals(Object obj) {
+ return obj != null && obj instanceof TagInfo && ((TagInfo)obj).id.equals(id) && ((TagInfo)obj).confidence.equals(confidence);
+ }
+ public final String getId() {
+ return id;
+ }
+
+ public final String getLabel() {
+ return label;
+ }
+
+ public final Double getConfidence() {
+ return confidence;
+ }
+
+ public final String[] getType() {
+ return type;
+ }
+
+ /**
+ * Implementation based on the id and confidence property
+ */
+ @Override
+ public int hashCode() {
+ return id.hashCode()+confidence.hashCode();
+ }
+
+}
Added: incubator/stanbol/trunk/iks-autotagging/src/main/java/eu/iksproject/autotagging/cli/CommandLineRunner.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/iks-autotagging/src/main/java/eu/iksproject/autotagging/cli/CommandLineRunner.java?rev=1041331&view=auto
==============================================================================
--- incubator/stanbol/trunk/iks-autotagging/src/main/java/eu/iksproject/autotagging/cli/CommandLineRunner.java (added)
+++ incubator/stanbol/trunk/iks-autotagging/src/main/java/eu/iksproject/autotagging/cli/CommandLineRunner.java Thu Dec 2 11:30:36 2010
@@ -0,0 +1,241 @@
+package eu.iksproject.autotagging.cli;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.List;
+import java.util.zip.GZIPInputStream;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.CommandLineParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.Options;
+import org.apache.commons.cli.ParseException;
+import org.apache.commons.cli.PosixParser;
+import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
+import org.apache.commons.io.IOUtils;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
+
+import com.hp.hpl.jena.rdf.model.Model;
+import com.hp.hpl.jena.tdb.TDBFactory;
+
+import eu.iksproject.autotagging.Autotagger;
+import eu.iksproject.autotagging.TagInfo;
+import eu.iksproject.autotagging.jena.ModelIndexer;
+import eu.iksproject.autotagging.jena.ModelResampler;
+
+/**
+ * Command line User Interface for importing RDF data into Jena models from
+ * dumps, sampling the relevant part and indexing the results with Lucene.
+ *
+ * @author ogrisel
+ */
+public class CommandLineRunner {
+
+ public static Options makeCommonOptions() {
+ Options options = new Options();
+ options.addOption("h", "help", false, "display this help and exit");
+ options.addOption("d", "debug", false,
+ "show debug stacktrace upon error");
+ return options;
+ }
+
+ public static void handleModel(String[] args) throws ParseException,
+ IOException {
+ CommandLineParser parser = new PosixParser();
+ Options options = makeCommonOptions();
+ CommandLine line = parser.parse(options, args);
+ args = line.getArgs();
+
+ if (args.length < 2 || line.hasOption("h")) {
+ HelpFormatter formatter = new HelpFormatter();
+ formatter.printHelp(
+ "model /path/to/tdb-model file.nt [file2.n3.gz file3.xml.bz2 ...]",
+ options);
+ System.exit(0);
+ }
+ String modelPath = args[1];
+ Model model = TDBFactory.createModel(modelPath);
+ for (String filename : Arrays.asList(args).subList(2, args.length)) {
+ System.out.printf("loading '%s' into model '%s'...", filename,
+ modelPath);
+ InputStream is = new FileInputStream(filename);
+
+ if (filename.endsWith(".gz")) {
+ is = new GZIPInputStream(is);
+ filename = filename.replaceFirst("\\.gz$", "");
+ } else if (filename.endsWith(".bz2")) {
+ is = new BZip2CompressorInputStream(is);
+ filename = filename.replaceFirst("\\.bz2$", "");
+ }
+
+ String format = null;
+ if (filename.endsWith(".nt")) {
+ format = "N-TRIPLE";
+ } else if (filename.endsWith(".n3")) {
+ format = "N3";
+ } // XML is the default format
+
+ model.read(is, null, format);
+ System.out.println(" done");
+ }
+ }
+
+ public static void handleResample(String[] args) throws ParseException {
+ CommandLineParser parser = new PosixParser();
+ Options options = makeCommonOptions();
+ Option maxTopResourcesOpt = new Option("t", "max-top-resources", true,
+ "maximum number of resources to sample");
+ maxTopResourcesOpt.setType(Integer.class);
+ options.addOption(maxTopResourcesOpt);
+ Option scoreFileOpt = new Option("s", "score-file", true,
+ "use TSV file holding ranked and scored resources");
+ options.addOption(scoreFileOpt);
+ CommandLine line = parser.parse(options, args);
+ boolean debug = line.hasOption("d");
+ args = line.getArgs();
+ if (args.length != 2 || line.hasOption("h")) {
+ HelpFormatter formatter = new HelpFormatter();
+ formatter.printHelp(
+ "resample /path/to/src-tdb-model /path/to/sampled-tdb-model",
+ options);
+ System.exit(0);
+ }
+ try {
+ int maxTopResources = Integer.parseInt(line.getOptionValue("t",
+ "10000"));
+ String scores = line.getOptionValue("s");
+ ModelResampler.resample(new File(args[0]), new File(args[1]),
+ new File(scores), maxTopResources);
+ } catch (Exception e) {
+ System.err.println(String.format("ERROR: %s - %s",
+ e.getClass().getSimpleName(), e.getMessage()));
+ if (debug) {
+ e.printStackTrace();
+ }
+ System.exit(5);
+ }
+ }
+
+ public static void handleIndex(String[] args) throws ParseException {
+ CommandLineParser parser = new PosixParser();
+ Options options = makeCommonOptions();
+ CommandLine line = parser.parse(options, args);
+ args = line.getArgs();
+ if (args.length < 2 || line.hasOption("h")) {
+ HelpFormatter formatter = new HelpFormatter();
+ formatter.printHelp(
+ "index /path/to/tdb-model /path/to/lucene-index", options);
+ System.exit(0);
+ }
+ try {
+ ModelIndexer.index(new File(args[0]), new File(args[1]));
+ } catch (Exception e) {
+ System.err.println("ERROR: " + e.getMessage());
+ System.exit(4);
+ }
+ }
+
+ public static void handleSuggest(String[] args) throws IOException,
+ ParseException {
+ CommandLineParser parser = new PosixParser();
+ Options options = makeCommonOptions();
+
+ options.addOption("i", "index", true,
+ "path to a specific lucene directory");
+
+ options.addOption("n", "name", true,
+ "restrict suggestions to lookup entities matching the provided name");
+
+ options.addOption("c", "context", true,
+ "restrict suggestions to entities similar to the provided context");
+
+ options.addOption("f", "context-file", true,
+ "restrict suggestions to entities similar to the provided utf-8 text file");
+
+ options.addOption("t", "type", true,
+ "restrict suggestions to entities of given type");
+
+ Option maxSuggestionsOpt = new Option("s", "max-suggestions", true,
+ "maximum number of suggestions");
+ maxSuggestionsOpt.setType(Integer.class);
+ options.addOption(maxSuggestionsOpt);
+
+ CommandLine line = parser.parse(options, args);
+ args = line.getArgs();
+ String name = line.getOptionValue("n");
+ String context = line.getOptionValue("c", "");
+ String contextFile = line.getOptionValue("f");
+
+ if (line.hasOption("h")
+ || (name == null && context == null && contextFile == null)) {
+ HelpFormatter formatter = new HelpFormatter();
+ formatter.printHelp(
+ "suggest --name \"John Smith\" --context-file smith-biography.txt ",
+ options);
+ System.exit(0);
+ }
+
+ String customIndex = line.getOptionValue("i");
+ Directory dir;
+ if (customIndex != null) {
+ dir = FSDirectory.open(new File(customIndex));
+ } else {
+ dir = FSDirectory.open(ModelIndexer.buildDefaultIndex());
+ }
+
+ int maxSuggestions = Integer.parseInt(line.getOptionValue("s", "3"));
+ Autotagger tagger = new Autotagger(dir).withMaxSuggestions(maxSuggestions);
+
+ if (contextFile != null) {
+ context += " ";
+ context = IOUtils.toString(new FileInputStream(
+ new File(contextFile)));
+ }
+ String type = line.getOptionValue("t");
+
+ System.out.printf("Computing suggestions...");
+ long startTime = System.currentTimeMillis();
+ List<TagInfo> suggestions = tagger.suggestForType(name, context, type);
+ System.out.printf(" done in %dms:\n",
+ (System.currentTimeMillis() - startTime));
+
+ for (int i = 0; i < suggestions.size(); i++) {
+ TagInfo tag = suggestions.get(i);
+ System.out.printf("Suggestion #%d (score: %f): '%s'\n", i + 1,
+ tag.getConfidence(), tag.getLabel());
+ System.out.printf("URI:\t%s\n", tag.getId());
+ for (String tagType : tag.getType()) {
+ System.out.printf("type:\t%s\n", tagType);
+ }
+ }
+ }
+
+ public static void main(String[] args) throws IOException, ParseException {
+ if (args.length < 1) {
+ System.out.println("expected command: model, resample, index or suggest");
+ System.exit(1);
+ }
+
+ String command = args[0];
+ String[] commandArgs = Arrays.copyOfRange(args, 1, args.length);
+
+ if (command.equals("model")) {
+ handleModel(commandArgs);
+ } else if (command.equals("resample")) {
+ handleResample(commandArgs);
+ } else if (command.equals("index")) {
+ handleIndex(commandArgs);
+ } else if (command.equals("suggest")) {
+ handleSuggest(commandArgs);
+ } else {
+ System.err.append("unknow command: " + args[0]);
+ System.exit(5);
+ }
+ }
+
+}
Added: incubator/stanbol/trunk/iks-autotagging/src/main/java/eu/iksproject/autotagging/jena/ModelIndexer.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/iks-autotagging/src/main/java/eu/iksproject/autotagging/jena/ModelIndexer.java?rev=1041331&view=auto
==============================================================================
--- incubator/stanbol/trunk/iks-autotagging/src/main/java/eu/iksproject/autotagging/jena/ModelIndexer.java (added)
+++ incubator/stanbol/trunk/iks-autotagging/src/main/java/eu/iksproject/autotagging/jena/ModelIndexer.java Thu Dec 2 11:30:36 2010
@@ -0,0 +1,283 @@
+package eu.iksproject.autotagging.jena;
+
+import java.io.Closeable;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URLDecoder;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.commons.lang.NotImplementedException;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriter.MaxFieldLength;
+import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.store.LockObtainFailedException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.hp.hpl.jena.query.QuerySolution;
+import com.hp.hpl.jena.query.ResultSet;
+import com.hp.hpl.jena.rdf.model.Literal;
+import com.hp.hpl.jena.rdf.model.Model;
+import com.hp.hpl.jena.rdf.model.ModelFactory;
+import com.hp.hpl.jena.rdf.model.Property;
+import com.hp.hpl.jena.rdf.model.Resource;
+import com.hp.hpl.jena.rdf.model.Statement;
+import com.hp.hpl.jena.rdf.model.StmtIterator;
+import com.hp.hpl.jena.tdb.TDBFactory;
+
+import eu.iksproject.autotagging.Autotagger;
+
+/**
+ * Build a Lucene index out of a Jena model
+ *
+ * @author ogrisel
+ *
+ */
+public class ModelIndexer implements Closeable {
+
+ private static final Logger log = LoggerFactory.getLogger(ModelIndexer.class);
+
+ public static final String URI_FIELD = "uri";
+
+ public static final String DEFAULT_DBPEDIA_SAMPLE = "dbpedia/dbpedia-sample-10000.nt";
+
+ public static final String POPULARITY_SCORE_PROPERTY = "http://www.iksproject.eu/ns/popularity-score";
+
+ private final IndexWriter iwriter;
+
+ private final Model model;
+
+ // reduce GC load by reusing Document and Fields instances
+ private final Map<String, Field> literalFields = new HashMap<String, Field>();
+
+ private final Map<String, Map<String, Field>> uriFields = new HashMap<String, Map<String, Field>>();
+
+ private final Document doc = new Document();
+
+ private final Map<String, Float> boostedFields = new HashMap<String, Float>();
+
+ private final String scorePropertyUri = POPULARITY_SCORE_PROPERTY;
+
+ private Field getField(String property, String value, boolean isLiteral) {
+ // make the cache key
+ if (isLiteral) {
+ Field cachedField = literalFields.get(property);
+ if (cachedField == null) {
+ cachedField = new Field(property, value, Field.Store.NO,
+ Field.Index.ANALYZED);
+ literalFields.put(property, cachedField);
+ } else {
+ cachedField.setValue(value);
+ }
+ return cachedField;
+ } else {
+ // TODO: make sure that the multivalued URI properties take value in
+ // a limit size controlled vocabulary which is the case for types,
+ // but not for relations between entities
+ Map<String, Field> cachedFields = uriFields.get(property);
+ if (cachedFields == null) {
+ cachedFields = new HashMap<String, Field>();
+ uriFields.put(property, cachedFields);
+ }
+ Field cachedField = cachedFields.get(value);
+ if (cachedField == null) {
+ cachedField = new Field(property, value, Field.Store.YES,
+ Field.Index.NOT_ANALYZED);
+ cachedFields.put(value, cachedField);
+ }
+ return cachedField;
+ }
+ }
+
+ public ModelIndexer(final IndexWriter iwriter, final Model model) {
+ this.iwriter = iwriter;
+ this.model = model;
+
+ // by default boost the title (a.k.a. rdfs:label of the entity)
+ boostedFields.put("http://www.w3.org/2000/01/rdf-schema#label", 3.0f);
+ }
+
+ public Map<String, Float> getBoostedFields() {
+ return boostedFields;
+ }
+
+ public void close() throws IOException {
+ iwriter.close();
+ model.close();
+ }
+
+ public Iterator<Document> indexIterator() {
+ ModelResampler sampler = new ModelResampler();
+ final ResultSet resultSet = sampler.queryAllResources(model);
+ final Property scoreProperty = model.getProperty(scorePropertyUri);
+
+ return new Iterator<Document>() {
+
+ public boolean hasNext() {
+ return resultSet.hasNext();
+ }
+
+ public Document next() {
+ QuerySolution solution = resultSet.next();
+ Resource r = solution.getResource("resource");
+ StmtIterator stmts = model.listStatements(r, null, null, null);
+ doc.getFields().clear();
+ doc.add(getField(URI_FIELD, r.getURI(), false));
+ List<Statement> stmtList = stmts.toList();
+
+ // find document boost info if any
+ float docBoost = 1.0f;
+ Statement toDelete = null;
+ for (Statement stmt : stmtList) {
+ if (stmt.getPredicate().equals(scoreProperty)) {
+ docBoost = stmt.getFloat();
+ toDelete = stmt;
+ }
+ }
+ if (toDelete != null) {
+ stmtList.remove(toDelete);
+ }
+
+ // index all statement objects as lucene fields
+ for (Statement stmt : stmtList) {
+ String text;
+ boolean isLiteral = stmt.getObject().isLiteral();
+ if (isLiteral) {
+ text = stmt.getObject().as(Literal.class).getString();
+ } else if (stmt.getObject().isURIResource()) {
+ text = stmt.getObject().as(Resource.class).getURI();
+ } else {
+ // skip non indexable nodes (blank nodes, seqs, bags,
+ // ...)
+ continue;
+ }
+ Field field = getField(stmt.getPredicate().toString(),
+ text, isLiteral);
+ Float boost = boostedFields.get(field.name());
+ if (boost != null) {
+ field.setBoost(boost * docBoost);
+ } else {
+ field.setBoost(docBoost);
+ }
+ doc.add(field);
+ }
+ try {
+ iwriter.addDocument(doc);
+ } catch (Exception e) {
+ log.error("error indexing " + r.getURI(), e);
+ return doc;
+ }
+ return doc;
+
+ }
+
+ public void remove() {
+ throw new NotImplementedException();
+ }
+ };
+ }
+
+ public static void index(Model model, IndexWriter writer, boolean close)
+ throws IOException {
+ ModelIndexer indexer = new ModelIndexer(writer, model);
+ try {
+ log.info("computing the list of entities to process...");
+ long lastTime = System.currentTimeMillis();
+ Iterator<Document> iterator = indexer.indexIterator();
+ long newTime = System.currentTimeMillis();
+ log.info(String.format("query took %fs",
+ (newTime - lastTime) / 1000.));
+ lastTime = newTime;
+ int i = 1;
+ long checkpointSize = 5000;
+ while (iterator.hasNext()) {
+ Document doc = iterator.next();
+ if (i % checkpointSize == 0) {
+ writer.commit();
+ newTime = System.currentTimeMillis();
+ double duration = (newTime - lastTime) / 1000.;
+ log.info(String.format(
+ "indexed entity %09d '%s' at %f entities/s", i,
+ URLDecoder.decode(doc.get(URI_FIELD), "UTF-8"),
+ checkpointSize / duration));
+ lastTime = newTime;
+ }
+ i++;
+ }
+ writer.commit();
+ log.info(String.format(
+ "successfully indexed %09d entities, now optimizing the index",
+ i));
+ writer.optimize();
+ } finally {
+ if (close) {
+ indexer.close();
+ }
+ }
+ }
+
+ public static void index(File tdbModel, File fsDirectory)
+ throws CorruptIndexException, LockObtainFailedException,
+ IOException {
+ Model model = TDBFactory.createModel(tdbModel.getAbsolutePath());
+ index(model, fsDirectory);
+ }
+
+ public static void index(Model model, File fsDirectory)
+ throws CorruptIndexException, LockObtainFailedException,
+ IOException {
+ MaxFieldLength maxFieldLength = new MaxFieldLength(100000);
+ // TODO: re-enable shingles once we can get rid of the "-"
+ // shingles
+ IndexWriter writer = new IndexWriter(FSDirectory.open(fsDirectory),
+ Autotagger.getDefaultAnalyzer(), true, maxFieldLength);
+ writer.setRAMBufferSizeMB(42);
+ index(model, writer, true);
+ }
+
+ public static String DEFAULT_INDEX_DIRECTORY() {
+ return "default-iks-autotagging-idx";
+ }
+
+ public static File buildDefaultIndex() throws CorruptIndexException,
+ LockObtainFailedException, IOException {
+ return ModelIndexer.buildDefaultIndex(null, false);
+ }
+
+ public static File buildDefaultIndex(File folder, boolean deleteExisting)
+ throws CorruptIndexException, LockObtainFailedException,
+ IOException {
+ if (folder == null) {
+ folder = new File(System.getProperty("java.io.tmpdir"));
+ }
+ File fsDirectory = new File(folder, DEFAULT_INDEX_DIRECTORY());
+ if (deleteExisting) {
+ log.info("deleting default indexed model in: "
+ + fsDirectory.getAbsolutePath());
+ FileUtils.deleteDirectory(fsDirectory);
+ }
+ if (!fsDirectory.exists()) {
+ log.info("creating default indexed model in: "
+ + fsDirectory.getAbsolutePath());
+ InputStream stream = ModelIndexer.class.getClassLoader().getResourceAsStream(
+ DEFAULT_DBPEDIA_SAMPLE);
+ if (stream == null) {
+ throw new IOException("could not find resource: "
+ + DEFAULT_DBPEDIA_SAMPLE);
+ }
+ Model model = ModelFactory.createDefaultModel();
+ model.read(stream, null, "N-TRIPLE");
+ ModelIndexer.index(model, fsDirectory);
+ }
+ return fsDirectory;
+ }
+
+}
Added: incubator/stanbol/trunk/iks-autotagging/src/main/java/eu/iksproject/autotagging/jena/ModelResampler.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/iks-autotagging/src/main/java/eu/iksproject/autotagging/jena/ModelResampler.java?rev=1041331&view=auto
==============================================================================
--- incubator/stanbol/trunk/iks-autotagging/src/main/java/eu/iksproject/autotagging/jena/ModelResampler.java (added)
+++ incubator/stanbol/trunk/iks-autotagging/src/main/java/eu/iksproject/autotagging/jena/ModelResampler.java Thu Dec 2 11:30:36 2010
@@ -0,0 +1,325 @@
+package eu.iksproject.autotagging.jena;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.UnsupportedEncodingException;
+import java.net.URLDecoder;
+import java.util.Iterator;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang.NotImplementedException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.hp.hpl.jena.query.Query;
+import com.hp.hpl.jena.query.QueryExecution;
+import com.hp.hpl.jena.query.QueryExecutionFactory;
+import com.hp.hpl.jena.query.QueryFactory;
+import com.hp.hpl.jena.query.QuerySolution;
+import com.hp.hpl.jena.query.QuerySolutionMap;
+import com.hp.hpl.jena.query.ResultSet;
+import com.hp.hpl.jena.query.Syntax;
+import com.hp.hpl.jena.rdf.model.Model;
+import com.hp.hpl.jena.rdf.model.ModelFactory;
+import com.hp.hpl.jena.rdf.model.Resource;
+import com.hp.hpl.jena.rdf.model.StmtIterator;
+import com.hp.hpl.jena.tdb.TDBFactory;
+
+/**
+ * Read a Jena model and extract the most popular resources given by a
+ * "tab separated values" files that holds the rank information of the
+ * resources.
+ *
+ * For instance such as file can be computed from the page links info of DBpedia
+ * using the corpusmaker toolkit: http://github.com/ogrisel/corpusmaker
+ *
+ * As it takes from 1h to 3h to compute such statistics, a precomputed file is
+ * available here:
+ *
+ * http://dl.dropbox.com/u/5743203/IKS/autotagging/incoming-counts-redirected.
+ * tsv.gz
+ *
+ * @author ogrisel
+ */
+public class ModelResampler {
+
+ private final Logger log = LoggerFactory.getLogger(getClass());
+
+ protected int maxTopResources = 10000;
+
+ protected File tsvScoreFile;
+
+ public ModelResampler() {
+ // use default values
+ }
+
+ public ModelResampler withMaxTopResources(int maxTopResources) {
+ this.maxTopResources = maxTopResources;
+ return this;
+ }
+
+ public ModelResampler withPrecomputedScoresFile(File tsvRanksFile) {
+ this.tsvScoreFile = tsvRanksFile;
+ return this;
+ }
+
+ /**
+ * Perform a query that returns a result set iterating over all typed
+ * resource. The ordering of the results is undefined.
+ *
+ * @param model the model to query
+ * @return a result set where 'resource' is bound to the a resource
+ */
+ public ResultSet queryAllResources(Model model) {
+ QuerySolution mapping = new QuerySolutionMap();
+ StringBuilder qb = new StringBuilder();
+ qb.append("SELECT distinct ?resource ");
+ qb.append("{ ");
+ qb.append(" ?resource a ?type . ");
+ qb.append(" FILTER ( isURI(?resource) ) . ");
+ qb.append("} ");
+ Query q = QueryFactory.create(qb.toString(), Syntax.syntaxARQ);
+ QueryExecution qexec = QueryExecutionFactory.create(q, model, mapping);
+ return qexec.execSelect();
+ }
+
+ /**
+ * Perform a query to find the top popular resources by counting incoming
+ * links. The score values are normalized (the most popular resource as a
+ * score of 1.0, unless all scores are 0.0).
+ *
+ * @param model the model to query
+ * @return a result set where 'resource' is bound to a popular resource
+ */
+ public Iterator<ResourceInfo> queryTopResources(Model model) {
+ QuerySolution mapping = new QuerySolutionMap();
+ StringBuilder qb = new StringBuilder();
+ qb.append("SELECT ?resource ( count(?incoming) AS ?count ) ");
+ qb.append("{ ");
+ qb.append(" ?resource a ?type . ");
+ qb.append(" OPTIONAL { ?incoming ?relationship ?resource . } . ");
+ qb.append(" FILTER ( isURI(?resource) ) . ");
+ qb.append("} ");
+ qb.append("GROUP BY ?resource ");
+ qb.append("ORDER BY DESC ( ?count ) ");
+ qb.append(String.format("OFFSET 0 LIMIT %d", maxTopResources));
+ Query q = QueryFactory.create(qb.toString(), Syntax.syntaxARQ);
+ final ResultSet resultSet = QueryExecutionFactory.create(q, model,
+ mapping).execSelect();
+ return new Iterator<ResourceInfo>() {
+
+ long sampled = 0;
+
+ double maxScore = 1.0;
+
+ public boolean hasNext() {
+ return resultSet.hasNext();
+ }
+
+ public ResourceInfo next() {
+ QuerySolution nextSolution = resultSet.nextSolution();
+ double count = nextSolution.getLiteral("count").getDouble();
+ double score = Math.log1p(count);
+ if (sampled == 0 && count > 0) {
+ maxScore = score;
+ }
+ sampled++;
+ return new ResourceInfo(nextSolution.getResource("resource"),
+ score / maxScore);
+ }
+
+ public void remove() {
+ throw new UnsupportedOperationException();
+ }
+ };
+ }
+
+ @SuppressWarnings("unchecked")
+ public Iterator<ResourceInfo> findTopResources(final Model model)
+ throws FileNotFoundException, IOException {
+ if (tsvScoreFile == null) {
+ return queryTopResources(model);
+ }
+ final Iterator<String> lines = IOUtils.lineIterator(
+ new FileInputStream(tsvScoreFile), "utf-8");
+ return new Iterator<ResourceInfo>() {
+
+ double maxScore = 1.0;
+
+ int sampled = 0;
+
+ ResourceInfo nextRi = null;
+
+ protected ResourceInfo fetchNext(boolean andForget) {
+ ResourceInfo result = nextRi;
+ if (result == null) {
+ if (lines.hasNext()) {
+ String line = lines.next();
+ String[] parts = line.split("\t");
+ if (parts.length != 2) {
+ log.warn(String.format("skipping line: '%s'", line));
+ return fetchNext(andForget);
+ }
+ double score = Double.parseDouble(parts[1].trim());
+ // take the log to avoid over popular entities to
+ // dominate the results (attenuate the Zipf law of
+ // culturally generated distribution)
+ score = Math.log1p(score);
+ if (sampled == 0 && score > 0) {
+ maxScore = score;
+ }
+ String resource = parts[0].trim();
+ if (!resource.startsWith("http://")) {
+ resource = "http://dbpedia.org/resource/"
+ + resource;
+ }
+ Resource r = model.createResource(resource);
+ if (!model.containsResource(r)) {
+ log.debug(String.format(
+ "skipping resource: '%s', not found in model",
+ resource));
+ return fetchNext(andForget);
+ }
+ result = new ResourceInfo(r, score / maxScore);
+ }
+ }
+ nextRi = andForget ? null : result;
+ return result;
+ }
+
+ public boolean hasNext() {
+ return sampled < maxTopResources && fetchNext(false) != null;
+ }
+
+ public ResourceInfo next() {
+ ResourceInfo next = fetchNext(true);
+ sampled++;
+ return next;
+ }
+
+ public void remove() {
+ throw new UnsupportedOperationException();
+ }
+ };
+ }
+
+ /**
+ * Iteratively sample statements carried by popular resources of sourceModel
+ * into targetModel.
+ *
+ * @param sourceModel model to sample popular resources from
+ * @param targetModel model to save resource attributes to
+ * @return an iterator over popular resource to monitor progress
+ * @throws IOException
+ * @throws FileNotFoundException
+ */
+ public Iterator<ResourceInfo> samplerIterator(final Model sourceModel,
+ final Model targetModel) throws FileNotFoundException, IOException {
+ final Iterator<ResourceInfo> topResources = findTopResources(sourceModel);
+ return new Iterator<ResourceInfo>() {
+
+ public boolean hasNext() {
+ return topResources.hasNext();
+ }
+
+ public ResourceInfo next() {
+ ResourceInfo ri = topResources.next();
+ StmtIterator stmts = sourceModel.listStatements(ri.resource,
+ null, null, null);
+ targetModel.add(stmts);
+ targetModel.add(targetModel.createLiteralStatement(
+ ri.resource,
+ targetModel.getProperty(ModelIndexer.POPULARITY_SCORE_PROPERTY),
+ ri.score));
+ return ri;
+ }
+
+ public void remove() {
+ throw new NotImplementedException();
+ }
+ };
+ }
+
+ /**
+ * Extract the most popular resources ranked by incoming relation into s *
+ * targetModel.
+ *
+ * @param sourceModel model to extract popular resource from
+ * @param targetModel model where to save the extracted resources data
+ *
+ * @throws IOException
+ * @throws FileNotFoundException
+ */
+ public void extractMostPopular(Model sourceModel, Model targetModel)
+ throws FileNotFoundException, IOException {
+ log.info("computing the list of resources to sample...");
+ long lastTime = System.currentTimeMillis();
+ Iterator<ResourceInfo> iterator = samplerIterator(sourceModel,
+ targetModel);
+ long newTime = System.currentTimeMillis();
+ log.info(String.format("query took %fs", (newTime - lastTime) / 1000.));
+ lastTime = newTime;
+ int i = 1;
+ long checkpointSize = 5000;
+ while (iterator.hasNext()) {
+ ResourceInfo ri = iterator.next();
+ if (i == 1 && ri.score == 0.0f) {
+ log.warn(String.format(
+ "most popular resource '%s' has a score of 0.0...",
+ ri.resource.getURI()));
+ }
+ if (i % checkpointSize == 0) {
+ newTime = System.currentTimeMillis();
+ double duration = (newTime - lastTime) / 1000.;
+ String uri = ri.resource.getURI();
+ try {
+ log.info(String.format(
+ "sampled resource %09d (at '%s' with score %f) - %f entities/s",
+ i, URLDecoder.decode(uri, "UTF-8"), ri.score,
+ checkpointSize / duration));
+ } catch (UnsupportedEncodingException e) {
+ log.warn(String.format("invalid URI '%s': %s", uri,
+ e.getMessage()));
+ }
+ lastTime = newTime;
+ }
+ i++;
+ }
+ log.info(String.format("successfully sampled %09d resources", i));
+ }
+
+ public static void resample(File srcTdbFolder, File targetFile,
+ File scoresFile, int maxTopResources) throws IOException {
+ Model sourceModel = TDBFactory.createModel(srcTdbFolder.getAbsolutePath());
+
+ String filename = targetFile.getName();
+ String format = null;
+ boolean useTemporaryModel = false;
+ if (filename.endsWith(".nt")) {
+ format = "N-TRIPLE";
+ useTemporaryModel = true;
+ } else if (filename.endsWith(".n3")) {
+ format = "N3";
+ useTemporaryModel = true;
+ } else if (filename.endsWith(".xml")) {
+ // format = null will use the XML syntax
+ useTemporaryModel = true;
+ }
+
+ // TODO: use a temporary TDB model in a temporary directory instead of
+ // a memory model that lacks scalability?
+ Model targetModel = useTemporaryModel ? ModelFactory.createDefaultModel()
+ : TDBFactory.createModel(targetFile.getAbsolutePath());
+
+ ModelResampler sampler = new ModelResampler().withMaxTopResources(
+ maxTopResources).withPrecomputedScoresFile(scoresFile);
+ sampler.extractMostPopular(sourceModel, targetModel);
+
+ if (useTemporaryModel) {
+ targetModel.write(new FileOutputStream(targetFile), format, null);
+ }
+ }
+}
Added: incubator/stanbol/trunk/iks-autotagging/src/main/java/eu/iksproject/autotagging/jena/ResourceInfo.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/iks-autotagging/src/main/java/eu/iksproject/autotagging/jena/ResourceInfo.java?rev=1041331&view=auto
==============================================================================
--- incubator/stanbol/trunk/iks-autotagging/src/main/java/eu/iksproject/autotagging/jena/ResourceInfo.java (added)
+++ incubator/stanbol/trunk/iks-autotagging/src/main/java/eu/iksproject/autotagging/jena/ResourceInfo.java Thu Dec 2 11:30:36 2010
@@ -0,0 +1,16 @@
+package eu.iksproject.autotagging.jena;
+
+import com.hp.hpl.jena.rdf.model.Resource;
+
+ public class ResourceInfo {
+
+     /** The resource the score applies to. */
+     public final Resource resource;
+
+     /** The score attached to the resource. */
+     public final Double score;
+
+     /**
+      * Pair a resource with its score; both values are stored as given.
+      *
+      * @param resource the resource the score applies to
+      * @param score the score attached to the resource
+      */
+     public ResourceInfo(Resource resource, Double score) {
+         this.score = score;
+         this.resource = resource;
+     }
+ }
Added: incubator/stanbol/trunk/iks-autotagging/src/main/resources/META-INF/MANIFEST.MF
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/iks-autotagging/src/main/resources/META-INF/MANIFEST.MF?rev=1041331&view=auto
==============================================================================
--- incubator/stanbol/trunk/iks-autotagging/src/main/resources/META-INF/MANIFEST.MF (added)
+++ incubator/stanbol/trunk/iks-autotagging/src/main/resources/META-INF/MANIFEST.MF Thu Dec 2 11:30:36 2010
@@ -0,0 +1,5 @@
+Manifest-Version: 1.0
+Bundle-ManifestVersion: 2
+Bundle-Name: IKS Autotagging
+Bundle-SymbolicName: eu.iksproject.autotagging
+
Added: incubator/stanbol/trunk/iks-autotagging/src/test/java/eu/iksproject/autotagging/AutotaggingTest.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/iks-autotagging/src/test/java/eu/iksproject/autotagging/AutotaggingTest.java?rev=1041331&view=auto
==============================================================================
--- incubator/stanbol/trunk/iks-autotagging/src/test/java/eu/iksproject/autotagging/AutotaggingTest.java (added)
+++ incubator/stanbol/trunk/iks-autotagging/src/test/java/eu/iksproject/autotagging/AutotaggingTest.java Thu Dec 2 11:30:36 2010
@@ -0,0 +1,194 @@
+package eu.iksproject.autotagging;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.queryParser.ParseException;
+import org.apache.lucene.queryParser.QueryParser;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.store.LockObtainFailedException;
+import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.util.Version;
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import com.hp.hpl.jena.rdf.model.Model;
+import com.hp.hpl.jena.rdf.model.ModelFactory;
+
+import eu.iksproject.autotagging.jena.ModelIndexer;
+
+public class AutotaggingTest { // exercises ModelIndexer indexing and Autotagger suggestions on a RAM index and on the default index
+ /** In-memory Jena model; setUp loads the two DBpedia N-TRIPLE test resources into it. */
+ private Model model;
+ /** In-memory Lucene directory that the per-test IndexWriter writes to. */
+ private RAMDirectory ramDirectory;
+ /** Analyzer shared by the index writer and by the query parser in testIndexing. */
+ private StandardAnalyzer analyzer;
+ /** Writer on ramDirectory, recreated by setUp; only testIndexing closes it. */
+ private IndexWriter writer;
+ /** Location returned by ModelIndexer.buildDefaultIndex(); opened with FSDirectory in the default-index tests. */
+ protected File defaultIndexDirectory;
+ /** Loads a classpath resource through the context class loader; fails the test when it is missing. */
+ public static InputStream getResource(String name) {
+ InputStream stream = Thread.currentThread().getContextClassLoader().getResourceAsStream(
+ name);
+ assertNotNull("failed to load resource " + name, stream);
+ return stream;
+ }
+ /** Runs once per class. NOTE(review): argument meaning of buildDefaultIndex(null, true) is defined in ModelIndexer - confirm there. */
+ @BeforeClass
+ public static void setUpDefaultIndex() throws Exception {
+ // create index from scratch
+ ModelIndexer.buildDefaultIndex(null, true);
+ }
+ /** Per-test fixture: fresh model, RAM directory, analyzer and writer, plus the default index location. */
+ @Before
+ public void setUp() throws CorruptIndexException,
+ LockObtainFailedException, IOException {
+ model = ModelFactory.createDefaultModel();
+ ramDirectory = new RAMDirectory();
+ analyzer = new StandardAnalyzer(Version.LUCENE_30);
+ writer = new IndexWriter(ramDirectory, analyzer, true,
+ new IndexWriter.MaxFieldLength(25000));
+ model.read(getResource("dbpedia_3.4_instancetype_en.nt"), null,
+ "N-TRIPLE");
+ model.read(getResource("dbpedia_3.4_longabstract_en.nt"), null,
+ "N-TRIPLE");
+ // will reuse the index built by setUpDefaultIndex
+ defaultIndexDirectory = ModelIndexer.buildDefaultIndex();
+ }
+ /** Indexes the model into RAM, then checks that a full-text query for 1981 hits exactly one entity carrying 3 types. */
+ @Test
+ public void testIndexing() throws IOException, ParseException {
+ // index model without closing it since it is memory only
+ ModelIndexer.index(model, writer, false);
+ writer.close();
+
+ // perform a query on the fulltext content of the abstracts in the model
+ IndexSearcher isearcher = new IndexSearcher(ramDirectory, true); // read-only=true
+ QueryParser parser = new QueryParser(Version.LUCENE_30,
+ "http://dbpedia.org/property/abstract", analyzer);
+ Query query = parser.parse("1981");
+ ScoreDoc[] hits = isearcher.search(query, null, 1000).scoreDocs;
+ assertEquals(1, hits.length);
+
+ // check that the match points to the expected entity
+ Document hitDoc = isearcher.doc(hits[0].doc);
+ assertEquals("http://dbpedia.org/resource/%21Action_Pact%21",
+ hitDoc.get(ModelIndexer.URI_FIELD));
+ String[] types = hitDoc.getValues("http://www.w3.org/1999/02/22-rdf-syntax-ns#type");
+ assertEquals(3, types.length);
+ isearcher.close();
+ }
+ /** Calls testIndexing to populate the RAM index, then checks id, label, types and confidence of the top suggestion. */
+ @Test
+ public void testAutotaggingWithCustomIndex() throws IOException, ParseException {
+ // index the model
+ testIndexing();
+
+ // perform a suggestion query
+ Autotagger autotagger = new Autotagger(ramDirectory);
+ List<TagInfo> tags = autotagger.suggest("The punk side in me is telling me to listen to the british band Action Pact.");
+ assertTrue(!tags.isEmpty());
+
+ assertEquals("http://dbpedia.org/resource/%21Action_Pact%21",
+ tags.get(0).getId());
+ assertEquals("!Action Pact!", tags.get(0).getLabel());
+ assertEquals(3, tags.get(0).getType().length);
+ assertEquals(0.59, tags.get(0).getConfidence(), 0.1f); // accepts any confidence within 0.1 of 0.59
+ assertEquals("http://dbpedia.org/ontology/Band", tags.get(0).getType()[0]);
+ assertEquals("http://dbpedia.org/ontology/Organisation",
+ tags.get(0).getType()[1]);
+ assertEquals("http://www.w3.org/2002/07/owl#Thing", tags.get(0).getType()[2]);
+ }
+ /** Suggestion restricted to type Person from a free-text context against the default index; expects Bob Marley with 4 types. */
+ @Test
+ public void testAutotaggingWithDefaultIndex() throws IOException,
+ ParseException {
+
+ // build a tagger using the default DBpedia based index
+ Directory dir = FSDirectory.open(defaultIndexDirectory);
+ Autotagger autotagger = new Autotagger(dir);
+
+ // perform a context similarity search for a Person
+ String context = "Let the autotagger guess who was a Jamaican"
+ + " musician, a lead singer and guitarist"
+ + " for a well known reggae band.";
+
+ List<TagInfo> tags = autotagger.suggestForType(context, "Person");
+ assertTrue(!tags.isEmpty());
+ TagInfo bestGuess = tags.get(0);
+
+ assertEquals("http://dbpedia.org/resource/Bob_Marley", bestGuess.getId());
+ assertEquals("Bob Marley", bestGuess.getLabel());
+
+ List<String> types = Arrays.asList(bestGuess.getType());
+ assertEquals(4, types.size());
+ assertTrue(types.contains("http://www.w3.org/2002/07/owl#Thing"));
+ assertTrue(types.contains("http://dbpedia.org/ontology/Person"));
+ assertTrue(types.contains("http://dbpedia.org/ontology/Artist"));
+ assertTrue(types.contains("http://dbpedia.org/ontology/MusicalArtist"));
+ }
+ /** Name plus context lookup restricted to type Place: the default strict lookup finds nothing, the lax lookup returns Paris. */
+ @Test
+ public void testEntityByNameWithContext() throws IOException {
+
+ // build a tagger using the default DBpedia based index
+ Directory dir = FSDirectory.open(defaultIndexDirectory);
+ Autotagger autotagger = new Autotagger(dir);
+
+ // fuzzy lookup by entity name
+ String name = "the city of Paris";
+
+ // TODO: find an entity where the context can help filter out ambiguity
+ String context = "The river Seine flows in the city of Paris ";
+
+ // strict name lookup (by default)
+ List<TagInfo> tags = autotagger.suggestForType(name, context, "Place");
+ assertTrue(tags.isEmpty());
+
+ // lax name lookup
+ tags = autotagger.withStrictNameLookup(false).suggestForType(name,
+ context, "Place");
+ assertTrue(!tags.isEmpty());
+ assertEquals("http://dbpedia.org/resource/Paris", tags.get(0).getId());
+ assertEquals("Paris", tags.get(0).getLabel());
+ }
+ /** Lax name-only lookup restricted to type Place: finds Paris, and returns nothing for a name absent from the index. */
+ @Test
+ public void testEntityByNameWithoutContext() throws IOException {
+
+ // build a tagger using the default DBpedia based index
+ Directory dir = FSDirectory.open(defaultIndexDirectory);
+ Autotagger autotagger = new Autotagger(dir).withStrictNameLookup(false);
+
+ // at least one of the terms is matching
+ String name = "The city of Paris";
+
+ List<TagInfo> tags = autotagger.suggestForType(name, "Place");
+ assertTrue(!tags.isEmpty());
+ assertEquals("http://dbpedia.org/resource/Paris", tags.get(0).getId());
+
+ // try with non existing name
+ name = "somethingnot referencedin theindex";
+ tags = autotagger.suggestForType(name, "Place");
+ assertTrue(tags.isEmpty());
+ }
+
+}
Added: incubator/stanbol/trunk/iks-autotagging/src/test/java/eu/iksproject/autotagging/ModelResamplerTest.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/iks-autotagging/src/test/java/eu/iksproject/autotagging/ModelResamplerTest.java?rev=1041331&view=auto
==============================================================================
--- incubator/stanbol/trunk/iks-autotagging/src/test/java/eu/iksproject/autotagging/ModelResamplerTest.java (added)
+++ incubator/stanbol/trunk/iks-autotagging/src/test/java/eu/iksproject/autotagging/ModelResamplerTest.java Thu Dec 2 11:30:36 2010
@@ -0,0 +1,105 @@
+package eu.iksproject.autotagging;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.Iterator;
+
+import org.junit.Before;
+import org.junit.Test;
+
+import com.hp.hpl.jena.rdf.model.Model;
+import com.hp.hpl.jena.rdf.model.ModelFactory;
+import com.hp.hpl.jena.rdf.model.Property;
+import com.hp.hpl.jena.rdf.model.Resource;
+
+import eu.iksproject.autotagging.jena.ModelIndexer;
+import eu.iksproject.autotagging.jena.ModelResampler;
+import eu.iksproject.autotagging.jena.ResourceInfo;
+
+public class ModelResamplerTest { // checks ModelResampler on a small hand-built graph of ten linked resources
+ /** Source graph populated by makeModels. */
+ protected Model srcModel;
+ /** Starts empty; handed to the sampler as second argument and inspected afterwards. */
+ protected Model targetModel;
+ /** Ten resources urn:0 .. urn:9, each typed as personClass in srcModel. */
+ protected Resource[] r;
+ /** Property urn:p, used for every link between the resources. */
+ protected Property p;
+ /** Property named by ModelIndexer.POPULARITY_SCORE_PROPERTY. */
+ protected Property score;
+ /** The http://dbpedia.org/ontology/Person resource. */
+ protected Resource personClass;
+ /** The rdf:type property. */
+ protected Property type;
+ /** Builds the fixture: r[1] receives 4 incoming p links, r[2] 3, r[5] 2, r[4] and r[8] 1 each. */
+ @Before
+ public void makeModels() {
+ srcModel = ModelFactory.createDefaultModel();
+ targetModel = ModelFactory.createDefaultModel();
+
+ // create properties and resources
+ p = srcModel.createProperty("urn:p");
+ score = srcModel.createProperty(ModelIndexer.POPULARITY_SCORE_PROPERTY);
+ type = srcModel.createProperty("http://www.w3.org/1999/02/22-rdf-syntax-ns#type");
+ personClass = srcModel.createResource("http://dbpedia.org/ontology/Person");
+ r = new Resource[10];
+
+ for (int i = 0; i < r.length; i++) {
+ r[i] = srcModel.createResource(String.format("urn:%d", i));
+ srcModel.add(r[i], type, personClass);
+ }
+
+ // connect resources with p
+ srcModel.add(r[0], p, r[1]);
+ srcModel.add(r[2], p, r[1]);
+ srcModel.add(r[3], p, r[1]);
+ srcModel.add(r[9], p, r[1]);
+
+ srcModel.add(r[0], p, r[2]);
+ srcModel.add(r[4], p, r[2]);
+ srcModel.add(r[9], p, r[2]);
+
+ srcModel.add(r[1], p, r[5]);
+ srcModel.add(r[4], p, r[5]);
+
+ srcModel.add(r[8], p, r[4]);
+
+ srcModel.add(r[4], p, r[8]);
+
+ }
+ /** With at most 2 top resources, expects r[1] (score 1.0) then r[2] (about 0.86). NOTE(review): scoring presumably follows incoming links - confirm in ModelResampler. */
+ @Test
+ public void testResampling() throws FileNotFoundException, IOException {
+ ModelResampler sampler = new ModelResampler().withMaxTopResources(2);
+ Iterator<ResourceInfo> samplerIterator = sampler.samplerIterator(
+ srcModel, targetModel);
+
+ assertTrue(samplerIterator.hasNext());
+ ResourceInfo ri = samplerIterator.next();
+ assertEquals(r[1], ri.resource);
+ double r1Score = ri.score.doubleValue();
+ assertEquals(1.0, r1Score, 0.01);
+
+ assertTrue(samplerIterator.hasNext());
+ ri = samplerIterator.next();
+ assertEquals(r[2], ri.resource);
+ double r2Score = ri.score.doubleValue();
+ assertEquals(0.86, r2Score, 0.01);
+
+ assertFalse(samplerIterator.hasNext());
+
+ assertEquals(6, targetModel.size()); // type + one p link + score statement for each of the two resources
+
+ assertTrue(targetModel.contains(r[1], type, personClass));
+ assertTrue(targetModel.contains(r[1], p, r[5]));
+ assertTrue(targetModel.containsLiteral(r[1], score, r1Score));
+
+ assertTrue(targetModel.contains(r[2], type, personClass));
+ assertTrue(targetModel.contains(r[2], p, r[1]));
+ assertTrue(targetModel.containsLiteral(r[2], score, r2Score));
+ }
+}