You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by og...@apache.org on 2010/12/02 12:30:37 UTC

svn commit: r1041331 [1/2] - in /incubator/stanbol/trunk/iks-autotagging: ./ samples/ src/ src/main/ src/main/java/ src/main/java/eu/ src/main/java/eu/iks/ src/main/java/eu/iksproject/ src/main/java/eu/iksproject/autotagging/ src/main/java/eu/iksprojec...

Author: ogrisel
Date: Thu Dec  2 11:30:36 2010
New Revision: 1041331

URL: http://svn.apache.org/viewvc?rev=1041331&view=rev
Log:
temporary import of iks-autotagging that will eventually be replaced by an equivalent implementation in rick

Added:
    incubator/stanbol/trunk/iks-autotagging/
    incubator/stanbol/trunk/iks-autotagging/LICENSE.txt
    incubator/stanbol/trunk/iks-autotagging/README.txt
    incubator/stanbol/trunk/iks-autotagging/pom.xml
    incubator/stanbol/trunk/iks-autotagging/samples/
    incubator/stanbol/trunk/iks-autotagging/samples/bob_marley.txt
    incubator/stanbol/trunk/iks-autotagging/samples/jimi_hendrix.txt
    incubator/stanbol/trunk/iks-autotagging/samples/russia_timezones.txt
    incubator/stanbol/trunk/iks-autotagging/src/
    incubator/stanbol/trunk/iks-autotagging/src/main/
    incubator/stanbol/trunk/iks-autotagging/src/main/java/
    incubator/stanbol/trunk/iks-autotagging/src/main/java/eu/
    incubator/stanbol/trunk/iks-autotagging/src/main/java/eu/iks/
    incubator/stanbol/trunk/iks-autotagging/src/main/java/eu/iksproject/
    incubator/stanbol/trunk/iks-autotagging/src/main/java/eu/iksproject/autotagging/
    incubator/stanbol/trunk/iks-autotagging/src/main/java/eu/iksproject/autotagging/Autotagger.java
    incubator/stanbol/trunk/iks-autotagging/src/main/java/eu/iksproject/autotagging/TagInfo.java
    incubator/stanbol/trunk/iks-autotagging/src/main/java/eu/iksproject/autotagging/cli/
    incubator/stanbol/trunk/iks-autotagging/src/main/java/eu/iksproject/autotagging/cli/CommandLineRunner.java
    incubator/stanbol/trunk/iks-autotagging/src/main/java/eu/iksproject/autotagging/jena/
    incubator/stanbol/trunk/iks-autotagging/src/main/java/eu/iksproject/autotagging/jena/ModelIndexer.java
    incubator/stanbol/trunk/iks-autotagging/src/main/java/eu/iksproject/autotagging/jena/ModelResampler.java
    incubator/stanbol/trunk/iks-autotagging/src/main/java/eu/iksproject/autotagging/jena/ResourceInfo.java
    incubator/stanbol/trunk/iks-autotagging/src/main/resources/
    incubator/stanbol/trunk/iks-autotagging/src/main/resources/META-INF/
    incubator/stanbol/trunk/iks-autotagging/src/main/resources/META-INF/MANIFEST.MF
    incubator/stanbol/trunk/iks-autotagging/src/test/
    incubator/stanbol/trunk/iks-autotagging/src/test/java/
    incubator/stanbol/trunk/iks-autotagging/src/test/java/eu/
    incubator/stanbol/trunk/iks-autotagging/src/test/java/eu/iksproject/
    incubator/stanbol/trunk/iks-autotagging/src/test/java/eu/iksproject/autotagging/
    incubator/stanbol/trunk/iks-autotagging/src/test/java/eu/iksproject/autotagging/AutotaggingTest.java
    incubator/stanbol/trunk/iks-autotagging/src/test/java/eu/iksproject/autotagging/ModelResamplerTest.java
    incubator/stanbol/trunk/iks-autotagging/src/test/resources/
    incubator/stanbol/trunk/iks-autotagging/src/test/resources/dbpedia_3.4_instancetype_en.nt
    incubator/stanbol/trunk/iks-autotagging/src/test/resources/dbpedia_3.4_longabstract_en.nt

Added: incubator/stanbol/trunk/iks-autotagging/LICENSE.txt
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/iks-autotagging/LICENSE.txt?rev=1041331&view=auto
==============================================================================
--- incubator/stanbol/trunk/iks-autotagging/LICENSE.txt (added)
+++ incubator/stanbol/trunk/iks-autotagging/LICENSE.txt Thu Dec  2 11:30:36 2010
@@ -0,0 +1,25 @@
+Copyright 2010 IKS Consortium. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are
+permitted provided that the following conditions are met:
+
+   1. Redistributions of source code must retain the above copyright notice, this list of
+      conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright notice, this list
+      of conditions and the following disclaimer in the documentation and/or other materials
+      provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY IKS CONSORTIUM ``AS IS'' AND ANY EXPRESS OR IMPLIED
+WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL IKS CONSORTIUM OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+The views and conclusions contained in the software and documentation are those of the
+authors and should not be interpreted as representing official policies, either expressed
+or implied, of IKS Consortium.

Added: incubator/stanbol/trunk/iks-autotagging/README.txt
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/iks-autotagging/README.txt?rev=1041331&view=auto
==============================================================================
--- incubator/stanbol/trunk/iks-autotagging/README.txt (added)
+++ incubator/stanbol/trunk/iks-autotagging/README.txt Thu Dec  2 11:30:36 2010
@@ -0,0 +1,184 @@
+IKS Autotagging
+===============
+
+:author: ogrisel@nuxeo.com
+
+Text document classification / topic assignment service based on the text
+content of DBpedia. The implementation is based on lucene and the
+`MoreLikeThis` similarity query that leverages term frequencies of the index.
+
+
+Building
+========
+
+1- Download maven_ and ensure that the `mvn` command is registered in your
+`PATH` environment variable.
+
+2- From the top of the `iks-autotagging/` folder build using maven as usual (this
+   will install the `iks-autotagging-X.X.X-SNAPSHOT.jar` jar in your local maven
+   repository and make it available to dependent projects such as FISE)::
+
+  % mvn install
+
+3- From the same folder, build a standalone jar suitable for commandline
+   usage. The resulting jar will be named:
+   `target/iks-autotagging-X.X.X-SNAPSHOT-jar-with-dependencies.jar`::
+
+  % mvn assembly:assembly
+
+4- (Optional) To import the project in eclipse, first run::
+
+  % mvn eclipse:eclipse
+
+This will generate `.project` and `.classpath` files so that you can
+"Import > Import existing projects into workspace" from the Eclipse
+UI. Alternatively you can install the `m2eclipse` plugin to directly
+import maven projects into eclipse.
+
+.. _maven: http://maven.apache.org
+
+
+Command line usage
+==================
+
+You can use the autotagger with a default embedded lucene index of
+the top 10000 entities of DBpedia::
+
+  % java -jar target/iks-autotagging-*-SNAPSHOT-jar-with-dependencies.jar \
+    suggest -f samples/bob_marley.txt
+  [...]
+  Annotating 'samples/bob_marley.txt'... done in 739ms:
+  Suggestion #1 (score: 4.648216): 'Bob Marley'
+  URI:	http://dbpedia.org/resource/Bob_Marley
+  type:	http://www.w3.org/2002/07/owl#Thing
+  type:	http://dbpedia.org/ontology/Person
+  type:	http://dbpedia.org/ontology/Artist
+  type:	http://dbpedia.org/ontology/MusicalArtist
+  Suggestion #2 (score: 0.127039): 'Bunny Wailer'
+  URI:	http://dbpedia.org/resource/Bunny_Wailer
+  type:	http://www.w3.org/2002/07/owl#Thing
+  type:	http://dbpedia.org/ontology/Person
+  type:	http://dbpedia.org/ontology/Artist
+  type:	http://dbpedia.org/ontology/MusicalArtist
+  Suggestion #3 (score: 0.121009): 'Desmond Dekker'
+  URI:	http://dbpedia.org/resource/Desmond_Dekker
+  type:	http://www.w3.org/2002/07/owl#Thing
+  type:	http://dbpedia.org/ontology/Person
+  type:	http://dbpedia.org/ontology/Artist
+  type:	http://dbpedia.org/ontology/MusicalArtist
+
+For better recall performance it is strongly recommended
+to use a more comprehensive index of DBpedia entities.
+
+To do so you first need to build or download a dedicated
+DBpedia lucene index in a folder named `/path/to/lucene-idx` (for instance)
+on the local filesystem (see later sections for instructions). You can download
+a prebuilt index from here:
+
+  http://dl.dropbox.com/u/5743203/IKS/autotagging/iks-dbpedia-lucene-idx-20100331-0.tar.bz2
+
+(A better index is currently under construction...)
+
+You can then add the "-i /path/to/lucene-idx" option to the
+previous command line to use your custom index.
+
+Instructions for building your own index from scratch are available in the following
+sections.
+
+
+Restful API
+===========
+
+:TODO: implement me first!
+
+Launch a HTTP server to provide the service using a RESTful API thanks to jetty
+and Jersey::
+
+  % mvn jetty:run
+  % curl -T file-to-annotate.txt http://localhost:8080/autotagging
+
+RDF/JSON annotations could be serialized using this convention: http://jdil.org/.
+
+Also the FISE project features an OSGi embedding of this library combined with
+RESTful interface and persistent annotation and content stores::
+
+  http://code.google.com/p/iks-project/source/browse/sandbox/fise/trunk/
+
+
+Building a lucene index from DBpedia dumps
+==========================================
+
+1- Download and uncompress (using `bzip2 -d <filename>`) the following datasets from DBpedia:
+
+  - instancetype_en_nt.bz2_
+
+  - longabstract_en_nt.bz2_
+
+  - article_label_en_nt.bz2_
+
+.. _instancetype_en_nt.bz2: http://downloads.dbpedia.org/3.4/en/instancetype_en.nt.bz2
+.. _longabstract_en_nt.bz2: http://downloads.dbpedia.org/3.4/en/longabstract_en.nt.bz2
+.. _article_label_en_nt.bz2: http://downloads.dbpedia.org/3.4/en/article_label.nt.bz2
+
+
+2- Build a temporary Jena TDB store::
+
+  % java -Xmx2g -server -jar target/iks-autotagging-*-SNAPSHOT-jar-with-dependencies.jar \
+    model /path/to/dbpedia-tdb /path/to/instancetype_en.nt /path/to/longabstract_en.nt /path/to/articles_label_en.nt
+
+Alternatively you can download and use the `bin/tdbloader` tool from the TDB
+distribution.
+
+3- Index the Jena TDB into a Lucene `FSDirectory`::
+
+  % java -Xmx2g -server -jar target/iks-autotagging-*-SNAPSHOT-jar-with-dependencies.jar \
+    index /path/to/dbpedia-tdb /path/to/lucene-idx
+
+You can then use luke_ to check the content of the resulting index::
+
+  % java -jar /path/to/lukeall-1.0.0.jar -index /path/to/lucene-idx
+
+.. _luke: http://www.getopt.org/luke/
+
+
+Recently implemented
+====================
+
+0- Finish implementing the `JenaIndexer#main` method to be able to create a Jena
+   TDB store out of DBpedia dumps from the command line.
+
+1- Use the lucene `ShingleFilter` to generate bi-grams (or tri-grams) of tokens
+   and improve the accuracy of the results at the expense of the size of the
+   index.
+
+2- Extend the `Autotagger` API to allow the requester to ask for a
+   specific entity type (useful to combine with the output of a Named Entity
+   detection module).
+
+4- Extend the `TagInfo` class to feedback the caller with confidence levels
+   for each suggestion (from lucene scores).
+
+5- Improve the `JenaIndexer` to index other text literal sources such as labels,
+   comments, ...
+
+6- Build a small index of the most popular people / place / organization from
+   DBpedia to be packaged easily as the default IKS model
+
+
+Roadmap
+=======
+
+3- Implement standalone jersey-based JAX-RS components that take the text
+   content as an input and output suggested annotations as RDF/XML or RDF/JSON
+   synchronously.
+
+7- Index the DBpedia categories dumps by aggregating literal text from
+   directly related entities, and propose suggestions with type topic to
+   complement entity-typed tags.
+
+8- Use a complete wikimedia markup dump as a fulltext source for the index
+   instead of just DBpedia
+
+9- Index the textual context (enclosing paragraph) of all incoming links
+   to an entity coming from other wikipedia articles.
+

Added: incubator/stanbol/trunk/iks-autotagging/pom.xml
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/iks-autotagging/pom.xml?rev=1041331&view=auto
==============================================================================
--- incubator/stanbol/trunk/iks-autotagging/pom.xml (added)
+++ incubator/stanbol/trunk/iks-autotagging/pom.xml Thu Dec  2 11:30:36 2010
@@ -0,0 +1,149 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project>
+  <modelVersion>4.0.0</modelVersion>
+  <groupId>eu.iksproject</groupId>
+  <artifactId>iks-autotagging</artifactId>
+  <version>0.1.0-SNAPSHOT</version>
+  <name>iks-autotagging</name>
+  <description>Service to assign DBpedia-based resources to unstructured
+    text content as related entities (Person, Place, Organization) or
+    topics (a.k.a.wikipedia categories).</description>
+  <repositories>
+  <repository>
+    <id>central</id>
+    <url>http://repo1.maven.org/maven2</url>
+  </repository>
+  <repository>
+    <!--
+      needed for the default model data while we decide where to put IKS
+      artifacts
+    -->
+    <id>nuxeo-vendor-release</id>
+    <url>https://maven.nuxeo.org/nexus/content/repositories/vendor-releases</url>
+  </repository>
+  </repositories>
+  <dependencies>
+    <dependency>
+      <groupId>eu.iksproject</groupId>
+      <artifactId>iks-autotagging-data</artifactId>
+      <version>0.1.2</version>
+    </dependency>
+    <dependency>
+      <groupId>commons-cli</groupId>
+      <artifactId>commons-cli</artifactId>
+      <version>1.2</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.commons</groupId>
+      <artifactId>commons-compress</artifactId>
+      <version>1.0</version>
+    </dependency>
+    <dependency>
+      <groupId>commons-lang</groupId>
+      <artifactId>commons-lang</artifactId>
+      <version>2.4</version>
+    </dependency>
+    <dependency>
+      <groupId>commons-io</groupId>
+      <artifactId>commons-io</artifactId>
+      <version>1.4</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.lucene</groupId>
+      <artifactId>lucene-core</artifactId>
+      <version>3.0.1</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.lucene</groupId>
+      <artifactId>lucene-queries</artifactId>
+      <version>3.0.1</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.lucene</groupId>
+      <artifactId>lucene-analyzers</artifactId>
+      <version>3.0.1</version>
+    </dependency>
+    <dependency>
+      <groupId>com.hp.hpl.jena</groupId>
+      <artifactId>jena</artifactId>
+      <version>2.6.2</version>
+      <exclusions>
+        <exclusion>
+          <groupId>org.slf4j</groupId>
+          <artifactId>slf4j-log4j12</artifactId>
+        </exclusion>
+      </exclusions>
+    </dependency>
+    <dependency>
+      <groupId>com.hp.hpl.jena</groupId>
+      <artifactId>arq</artifactId>
+      <version>2.8.2</version>
+      <exclusions>
+        <exclusion>
+          <groupId>com.sun.jmx</groupId>
+          <artifactId>jmxri</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>javax.jms</groupId>
+          <artifactId>jms</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>com.sun.jdmk</groupId>
+          <artifactId>jmxtools</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.slf4j</groupId>
+          <artifactId>slf4j-log4j12</artifactId>
+        </exclusion>
+      </exclusions>
+    </dependency>
+    <dependency>
+      <groupId>com.hp.hpl.jena</groupId>
+      <artifactId>tdb</artifactId>
+      <version>0.8.4</version>
+    </dependency>
+    <dependency>
+      <groupId>org.slf4j</groupId>
+      <artifactId>slf4j-api</artifactId>
+      <version>1.5.8</version>
+    </dependency>
+    <dependency>
+      <groupId>org.slf4j</groupId>
+      <artifactId>slf4j-simple</artifactId>
+      <version>1.5.8</version>
+    </dependency>
+    <dependency>
+      <groupId>junit</groupId>
+      <artifactId>junit</artifactId>
+      <version>4.7</version>
+      <scope>test</scope>
+    </dependency>
+  </dependencies>
+
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-compiler-plugin</artifactId>
+        <configuration>
+          <source>1.5</source>
+          <target>1.5</target>
+        </configuration>
+      </plugin>
+      <plugin>
+        <artifactId>maven-assembly-plugin</artifactId>
+        <version>2.2-beta-5</version>
+        <configuration>
+          <descriptorRefs>
+            <descriptorRef>jar-with-dependencies</descriptorRef>
+          </descriptorRefs>
+          <archive>
+            <manifest>
+              <mainClass>eu.iksproject.autotagging.cli.CommandLineRunner</mainClass>
+            </manifest>
+          </archive>
+        </configuration>
+      </plugin>
+    </plugins>
+  </build>
+</project>

Added: incubator/stanbol/trunk/iks-autotagging/samples/bob_marley.txt
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/iks-autotagging/samples/bob_marley.txt?rev=1041331&view=auto
==============================================================================
--- incubator/stanbol/trunk/iks-autotagging/samples/bob_marley.txt (added)
+++ incubator/stanbol/trunk/iks-autotagging/samples/bob_marley.txt Thu Dec  2 11:30:36 2010
@@ -0,0 +1,15 @@
+Robert Nesta "Bob" Marley (February 6, 1945 – May 11, 1981) was
+a Jamaican singer-songwriter and musician. He was the lead singer,
+songwriter and guitarist for the ska, rocksteady and reggae bands The
+Wailers (1964–1974) and Bob Marley & The Wailers (1974–1981). Marley
+remains the most widely known and revered performer of reggae music,
+and is credited for helping spread both Jamaican music and the Rastafari
+movement to a worldwide audience.
+
+Marley's best known hits include "I Shot the Sheriff", "No Woman, No
+Cry", "Could You Be Loved", "Stir It Up", "Jamming", "Redemption Song",
+"One Love" and, together with The Wailers, "Three Little Birds", as well
+as the posthumous releases "Buffalo Soldier" and "Iron Lion Zion". The
+compilation album, Legend (1984), released three years after his death,
+is reggae's best-selling album, being 10 times Platinum (Diamond) in
+the U.S., and selling 20 million copies worldwide.

Added: incubator/stanbol/trunk/iks-autotagging/samples/jimi_hendrix.txt
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/iks-autotagging/samples/jimi_hendrix.txt?rev=1041331&view=auto
==============================================================================
--- incubator/stanbol/trunk/iks-autotagging/samples/jimi_hendrix.txt (added)
+++ incubator/stanbol/trunk/iks-autotagging/samples/jimi_hendrix.txt Thu Dec  2 11:30:36 2010
@@ -0,0 +1,36 @@
+James Marshall "Jimi" Hendrix (born Johnny Allen Hendrix; November
+27, 1942 – September 18, 1970) was an American guitarist, singer and
+songwriter. He is often considered to be the greatest electric guitarist
+in the history of rock music by other musicians and commentators in
+the industry, and one of the most important and influential musicians
+of his era across a range of genres. After initial success in Europe,
+he achieved fame in the United States following his 1967 performance
+at the Monterey Pop Festival. Later, Hendrix headlined the iconic
+1969 Woodstock Festival and the 1970 Isle of Wight Festival. Hendrix
+often favored raw overdriven amplifiers with high gain and treble and
+helped develop the previously undesirable technique of guitar amplifier
+feedback. Hendrix was one of the musicians who popularized the wah-wah
+pedal in mainstream rock which he often used to deliver an exaggerated
+pitch in his solos, particularly with high bends and use of legato based
+around the pentatonic scale. He was influenced by blues artists such as
+B.B. King, Muddy Waters, Howlin' Wolf, Albert King, and Elmore James,
+rhythm and blues and soul guitarists Curtis Mayfield, Steve Cropper, as
+well as by some modern jazz. In 1966, Hendrix, who played and recorded
+with Little Richard's band from 1964 to 1965, said, "I want to do with
+my guitar what Little Richard does with his voice."
+
+As a record producer, Hendrix also broke new ground in using the recording
+studio as an extension of his musical ideas. He was one of the first to
+experiment with stereophonic and phasing effects for rock recording.
+
+Hendrix won many of the most prestigious rock music awards in his
+lifetime, and has been posthumously awarded many more, including being
+inducted into the US Rock and Roll Hall of Fame in 1992 and the UK Music
+Hall of Fame in 2005. An English Heritage blue plaque was erected in
+his name on his former residence at Brook Street, London, in September
+1997. A star on the Hollywood Walk of Fame (at 6627 Hollywood Blvd.) was
+dedicated in 1994. In 2006, his debut US album, Are You Experienced,
+was inducted into the United States National Recording Registry, and
+Rolling Stone named Hendrix the top guitarist on its list of the 100
+greatest guitarists of all-time in 2003. He was also the first person
+inducted into the Native American Music Hall of Fame.

Added: incubator/stanbol/trunk/iks-autotagging/samples/russia_timezones.txt
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/iks-autotagging/samples/russia_timezones.txt?rev=1041331&view=auto
==============================================================================
--- incubator/stanbol/trunk/iks-autotagging/samples/russia_timezones.txt (added)
+++ incubator/stanbol/trunk/iks-autotagging/samples/russia_timezones.txt Thu Dec  2 11:30:36 2010
@@ -0,0 +1,25 @@
+The Russian government has decided to remove two of its eleven timezones,
+in the country's first step towards time reform, first started by
+president Dmitriy Medvedev in last November.
+
+The affected regions were Chukotka, the easternmost province of Russia
+were moved back an hour, as were and Samara and Udmurtia, which are now
+on Moscow time.  The changes were implemented on Saturday night, when
+most of the country was due to put their country ahead for summer time;
+however, affected areas instead didn't change their clocks at all.
+
+"It's possible that this could also aid the strengthening of Russia's
+position as a link in the global information infrastructure," Medvedev
+remarked earlier this month.  "Reducing of amount of time zones is very
+efficient for managing, for accordance of actions, for approximation of
+far regions to the center," commented Arkady Tishkov, who is a deputy
+science director of Geography Institution for the Russian Academy
+of Sciences. Tishkov speculated that the number of time zones could
+eventually be reduced to six.  Meanwhile, an online petition has been
+posted opposing the time change for the Samara province, and it has
+garnered close to 13,000 signatures. "Trips take place to many regions
+of the country and world where time, you understand, far from always
+corresponds with Moscow," the text of the petition read, adding that
+"In the winter, darkness will come almost at lunchtime, which isn't
+convenient and is psychologically quite hard."
+

Added: incubator/stanbol/trunk/iks-autotagging/src/main/java/eu/iksproject/autotagging/Autotagger.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/iks-autotagging/src/main/java/eu/iksproject/autotagging/Autotagger.java?rev=1041331&view=auto
==============================================================================
--- incubator/stanbol/trunk/iks-autotagging/src/main/java/eu/iksproject/autotagging/Autotagger.java (added)
+++ incubator/stanbol/trunk/iks-autotagging/src/main/java/eu/iksproject/autotagging/Autotagger.java Thu Dec  2 11:30:36 2010
@@ -0,0 +1,328 @@
+package eu.iksproject.autotagging;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.net.URLDecoder;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.shingle.ShingleAnalyzerWrapper;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.FuzzyQuery;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.search.similar.MoreLikeThis;
+import org.apache.lucene.search.similar.MoreLikeThisQuery;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.Version;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import eu.iksproject.autotagging.jena.ModelIndexer;
+
+/**
+ * Engine that uses a Lucene index of DBpedia entities (types and abstracts) to
+ * suggest the top 3 entities that are semantically related to the text content
+ * to annotate.
+ *
+ * @author ogrisel
+ */
+public class Autotagger {
+
+    /** Logger named after the runtime class of this instance. */
+    private final Logger log = LoggerFactory.getLogger(getClass());
+
+    /**
+     * Name of the stored index field whose values are reported as the types
+     * of a suggested entity. Public and mutable: callers may reassign it.
+     */
+    public String typeFieldName = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type";
+
+    /** Name of the index field queried by the fuzzy name lookup. */
+    private final String lookupFieldName = "http://www.w3.org/2000/01/rdf-schema#label";
+
+    /** Names of the index fields handed to the context similarity query. */
+    private String[] likeFieldNames = {
+            "http://www.w3.org/2000/01/rdf-schema#label",
+            "http://dbpedia.org/property/abstract" };
+
+    /** Name of the stored index field holding the entity identifier. */
+    private String idField = ModelIndexer.URI_FIELD;
+
+    /** Maximum number of hits requested from the index per suggest call. */
+    private int maxSuggestions = 3;
+
+    /** Boost applied to each fuzzy query of the name lookup. */
+    private float lookupBoost = 2f;
+
+    /** Boost applied to the context similarity query. */
+    private float contextBoost = 1f;
+
+    /** Analyzer used to tokenize the text and names passed to suggest. */
+    private Analyzer analyzer = getDefaultAnalyzer();
+
+    // NOTE(review): not read anywhere in the visible part of this class;
+    // presumably used to recognize DBpedia ontology types - confirm.
+    private String typePrefix = "http://dbpedia.org/ontology/";
+
+    /**
+     * When true each fuzzy name clause is added as MUST, otherwise as SHOULD.
+     */
+    private boolean strictLookup = true;
+
+    /** Lucene directory holding the index that suggest calls search. */
+    private final Directory directory;
+
+    /**
+     * Build an autotagger on top of an existing Lucene index. The directory
+     * is only stored here; it is opened when a suggestion is requested.
+     *
+     * @param directory the Lucene directory holding the entity index
+     */
+    public Autotagger(Directory directory) {
+        this.directory = directory;
+    }
+
+    /**
+     * Create the analyzer used when no custom analyzer has been configured.
+     *
+     * @return a new {@link StandardAnalyzer} matching Lucene 3.0 behavior
+     */
+    public static Analyzer getDefaultAnalyzer() {
+        final Version matchVersion = Version.LUCENE_30;
+        return new StandardAnalyzer(matchVersion);
+    }
+
+    /**
+     * Give access to the configured analyzer, optionally decorated to emit
+     * shingles (token n-grams).
+     *
+     * @param withShingles true to wrap the configured analyzer in a new
+     *            {@link ShingleAnalyzerWrapper}, false to get it unchanged
+     * @return the analyzer to use
+     */
+    public Analyzer getAnalyzer(boolean withShingles) {
+        if (!withShingles) {
+            return analyzer;
+        }
+        return new ShingleAnalyzerWrapper(analyzer);
+    }
+
+    /**
+     * Set the names of the index fields used by the context similarity
+     * query. The array is stored as given, without copying.
+     *
+     * @param fieldNames the field names to compare the context against
+     * @return this instance, to allow call chaining
+     */
+    public Autotagger withFieldNames(String[] fieldNames) {
+        this.likeFieldNames = fieldNames;
+        return this;
+    }
+
+    /**
+     * Set the name of the stored index field that holds the identifier of
+     * each entity.
+     *
+     * @param idField the name of the identifier field
+     * @return this instance, to allow call chaining
+     */
+    public Autotagger withIdFieldName(String idField) {
+        this.idField = idField;
+        return this;
+    }
+
+    /**
+     * Set the maximum number of hits requested from the index for each
+     * suggest call. The value is not validated here.
+     *
+     * @param maxSuggestions the maximum number of suggestions
+     * @return this instance, to allow call chaining
+     */
+    public Autotagger withMaxSuggestions(int maxSuggestions) {
+        this.maxSuggestions = maxSuggestions;
+        return this;
+    }
+
+    /**
+     * Replace the analyzer used to tokenize the text and names passed to the
+     * suggest methods.
+     *
+     * @param analyzer the analyzer to use instead of the default one
+     * @return this instance, to allow call chaining
+     */
+    public Autotagger withAnalyzer(Analyzer analyzer) {
+        this.analyzer = analyzer;
+        return this;
+    }
+
+    /**
+     * Set the boost applied to each fuzzy query of the name lookup.
+     *
+     * @param lookupBoost the boost of the name lookup clauses
+     * @return this instance, to allow call chaining
+     */
+    public Autotagger withLookupBoost(float lookupBoost) {
+        this.lookupBoost = lookupBoost;
+        return this;
+    }
+
+    /**
+     * Set the boost applied to the context similarity query.
+     *
+     * @param contextBoost the boost of the context similarity clause
+     * @return this instance, to allow call chaining
+     */
+    public Autotagger withContextBoost(float contextBoost) {
+        this.contextBoost = contextBoost;
+        return this;
+    }
+
+    /**
+     * Set the type prefix.
+     *
+     * NOTE(review): the field is not read in the visible part of this class;
+     * presumably it identifies the namespace of relevant types - confirm.
+     *
+     * @param typePrefix the new type prefix
+     * @return this instance, to allow call chaining
+     */
+    public Autotagger withTypePrefix(String typePrefix) {
+        this.typePrefix = typePrefix;
+        return this;
+    }
+
+    /**
+     * Choose how the fuzzy name clauses are combined into the query: as MUST
+     * clauses when strict, as SHOULD clauses otherwise.
+     *
+     * @param strictLookup true to add every name token clause as MUST
+     * @return this instance, to allow call chaining
+     */
+    public Autotagger withStrictNameLookup(boolean strictLookup) {
+        this.strictLookup = strictLookup;
+        return this;
+    }
+
+    /**
+     * Suggest entities for the given text without any field filtering. This
+     * delegates to {@link #suggest(String, Map)} with a null filter map.
+     *
+     * @param text the textual content to find matching entities for
+     * @return entities info that best match the text
+     * @throws CorruptIndexException
+     * @throws IOException
+     */
+    public List<TagInfo> suggest(String text) throws CorruptIndexException,
+            IOException {
+        // typed null: no additional field filters
+        final Map<String, List<String>> noFilters = null;
+        return suggest(text, noFilters);
+    }
+
+    /**
+     * Suggest entities for the given text, choosing the strategy from its
+     * length. The text is tokenized with the configured analyzer: more than
+     * three tokens triggers a context similarity search, otherwise the text
+     * is treated as a name and a fuzzy name lookup is performed. The results
+     * are further restricted to match the field values given in fieldFilters.
+     *
+     * @param text the textual content used for the search, must not be null
+     * @param fieldFilters values the results must match exactly, keyed by
+     *            field name; may be null to disable filtering
+     * @return entities info that best match the text
+     * @throws IllegalArgumentException if text is empty
+     * @throws CorruptIndexException
+     * @throws IOException
+     */
+    public List<TagInfo> suggest(String text,
+            Map<String, List<String>> fieldFilters)
+            throws CorruptIndexException, IOException {
+
+        // texts with at most this many tokens are handled as a name lookup
+        final int maxNameTokens = 3;
+
+        // count tokens using the analyzer, stopping as soon as the threshold
+        // is exceeded: the exact count of a long text is never needed
+        int tokens = 0;
+        TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(
+                text));
+        try {
+            while (tokens <= maxNameTokens && tokenStream.incrementToken()) {
+                tokens++;
+            }
+        } finally {
+            // release the stream and its underlying reader
+            tokenStream.close();
+        }
+        if (tokens > maxNameTokens) {
+            // this is a context based suggestion
+            return suggest(null, text, fieldFilters);
+        } else {
+            // this is a name lookup
+            return suggest(text, null, fieldFilters);
+        }
+    }
+
+    /**
+     * Suggest entities that are fuzzy matching the given name and/or textually
+     * similar to the given context. Further restrict the results to match the
+     * field values given in the fieldFilter
+     *
+     * @param name the entity name used for the fuzzy lookup, may be null
+     * @param context the textual content used for similarity search, may be null
+     * @param fieldFilters
+     * @return entities info that best match the text
+     * @throws CorruptIndexException
+     * @throws IOException
+     */
+    public List<TagInfo> suggest(String name, String context,
+            Map<String, List<String>> fieldFilters)
+            throws CorruptIndexException, IOException {
+
+        if ((name == null || name.length() == 0)
+                && (context == null || context.length() == 0)) {
+            throw new IllegalArgumentException(
+                    "name and context value cannot be both null or empty");
+        }
+
+        List<TagInfo> suggestions = new ArrayList<TagInfo>(maxSuggestions);
+        IndexReader reader = IndexReader.open(directory, true);
+        IndexSearcher searcher = new IndexSearcher(reader);
+
+        BooleanQuery query = new BooleanQuery();
+        try {
+
+            // fuzzy name lookup
+            if (name != null) {
+                TokenStream ts = analyzer.tokenStream(lookupFieldName,
+                        new StringReader(name));
+                TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
+                while (ts.incrementToken()) {
+                    FuzzyQuery fuzzyQuery = new FuzzyQuery(new Term(
+                            lookupFieldName, termAtt.term()), 0.8f);
+                    // TODO: divide boost by number of terms
+                    fuzzyQuery.setBoost(lookupBoost);
+                    query.add(fuzzyQuery,
+                            strictLookup ? BooleanClause.Occur.MUST
+                                    : BooleanClause.Occur.SHOULD);
+                }
+            }
+
+            // similarity context search
+            if (context != null) {
+                // TODO: use FuzzyLikeThisQuery instead?
+                // TODO: re-enable shingles once we can get rid of the "-"
+                // shingles
+                MoreLikeThisQuery mltQuery = new MoreLikeThisQuery(context,
+                        likeFieldNames, getAnalyzer(false));
+                mltQuery.setPercentTermsToMatch(0.15f);
+                mltQuery.setMaxQueryTerms(20);
+                mltQuery.setMinTermFrequency(1);
+                mltQuery.setMinDocFreq(1);
+                mltQuery.setBoost(contextBoost);
+                query.add(mltQuery, BooleanClause.Occur.SHOULD);
+            }
+
+            // additional exact match filters
+            if (fieldFilters != null) {
+                for (Map.Entry<String, List<String>> fieldFilter : fieldFilters.entrySet()) {
+                    for (String value : fieldFilter.getValue()) {
+                        TermQuery tq = new TermQuery(new Term(
+                                fieldFilter.getKey(), value));
+                        // should not influence ranking, just filtering
+                        tq.setBoost(0.0f);
+                        query.add(tq, BooleanClause.Occur.MUST);
+                    }
+                }
+            }
+            TopDocs hits = searcher.search(query, maxSuggestions);
+            ScoreDoc[] scoreDocs = hits.scoreDocs;
+            for (int i = 0; i < Math.min(maxSuggestions, hits.totalHits); i++) {
+                double confidence = scoreDocs[i].score;
+                if (confidence == 0.0) {
+                    // this might happen with BooleanClause.Occur.SHOULD queries
+                    continue;
+                }
+                Document d = searcher.doc(scoreDocs[i].doc);
+                String id = d.get(idField);
+                log.debug(String.format("entity '%s' matches with score %f",
+                        id, confidence));
+
+                // assuming we are using DBPedia, we are extracting the label
+                // from the entity URI to avoid loading the lucene index with
+                // a stored label field
+                String label = URLDecoder.decode(id, "UTF-8");
+                label = label.substring(
+                        "http://dbpedia.org/resource/".length(), label.length());
+                label = label.replace("_", " ");
+                TagInfo tag = new TagInfo(id, label,
+                        d.getValues(typeFieldName), confidence);
+                suggestions.add(tag);
+            }
+        } finally {
+            reader.close();
+            searcher.close();
+        }
+        return suggestions;
+    }
+
+    /**
+     * Suggest entities that are fuzzy matching the given text (if short) or
+     * textually similar to the text (if long). Further restrict the results to
+     * match the type given either as full URI or DBpedia class name.
+     *
+     * @param text the text to find matching entities for
+     * @param type full URI (starting with "http://") or short class name that
+     *            gets prefixed with typePrefix; when null no type filter is
+     *            applied, as in the three-argument variant of this method
+     * @return ranked entities info that best match
+     * @throws CorruptIndexException
+     * @throws IOException
+     */
+    public List<TagInfo> suggestForType(String text, String type)
+            throws CorruptIndexException, IOException {
+        Map<String, List<String>> fieldFilters = new HashMap<String, List<String>>();
+        if (type != null) {
+            if (!type.startsWith("http://")) {
+                // short class name: expand it to a full type URI
+                type = typePrefix + type;
+            }
+            fieldFilters.put(typeFieldName, Arrays.asList(type));
+        }
+        return suggest(text, fieldFilters);
+    }
+
+    /**
+     * Look up entities by fuzzy name matching and/or textual similarity with
+     * a context, optionally keeping only the entities of a given type.
+     *
+     * @param name the name to fuzzy match, passed as is to suggest
+     * @param context the text to compare entities with, passed as is to
+     *            suggest
+     * @param type full URI (starting with "http://") or short class name that
+     *            gets prefixed with typePrefix; null means no type filter
+     * @return ranked entities info that best match
+     * @throws CorruptIndexException
+     * @throws IOException
+     */
+    public List<TagInfo> suggestForType(String name, String context, String type)
+            throws CorruptIndexException, IOException {
+        final Map<String, List<String>> filters = new HashMap<String, List<String>>();
+        if (type == null) {
+            // no type restriction requested: search with an empty filter set
+            return suggest(name, context, filters);
+        }
+        final String typeUri = type.startsWith("http://") ? type
+                : typePrefix + type;
+        filters.put(typeFieldName, Arrays.asList(typeUri));
+        return suggest(name, context, filters);
+    }
+
+
+    /**
+     * Extract the terms of the given text that MoreLikeThis finds the most
+     * interesting with regard to the index, looking at the likeFieldNames
+     * fields.
+     *
+     * @param text the text to analyze
+     * @return the interesting terms, at most maxSuggestions query terms are
+     *         requested
+     * @throws CorruptIndexException
+     * @throws IOException
+     */
+    public String[] mostImportantTerms(String text) throws CorruptIndexException, IOException {
+        IndexReader reader = IndexReader.open(directory, true);
+        try {
+            MoreLikeThis mlt = new MoreLikeThis(reader);
+            mlt.setFieldNames(likeFieldNames);
+            mlt.setAnalyzer(analyzer);
+            mlt.setMaxQueryTerms(maxSuggestions);
+            return mlt.retrieveInterestingTerms(new StringReader(text));
+        } finally {
+            // the reader is opened for this call only: always release it
+            reader.close();
+        }
+    }
+}
\ No newline at end of file

Added: incubator/stanbol/trunk/iks-autotagging/src/main/java/eu/iksproject/autotagging/TagInfo.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/iks-autotagging/src/main/java/eu/iksproject/autotagging/TagInfo.java?rev=1041331&view=auto
==============================================================================
--- incubator/stanbol/trunk/iks-autotagging/src/main/java/eu/iksproject/autotagging/TagInfo.java (added)
+++ incubator/stanbol/trunk/iks-autotagging/src/main/java/eu/iksproject/autotagging/TagInfo.java Thu Dec  2 11:30:36 2010
@@ -0,0 +1,81 @@
+package eu.iksproject.autotagging;
+
+
+/**
+ * Simple data transfer object to hold the results of the Autotagger annotation
+ * process. This then can be mapped to a very simple RDF graph to publish the
+ * results annotations to third party applications.
+ *
+ * @author ogrisel
+ */
+public class TagInfo {
+
+    /**
+     * Unique ID of the entity that is related to the text content. This is
+     * typically the DBpedia unique URI of the entity.
+     */
+    private final String id;
+
+    /**
+     * Human readable label (or name) of the related entity.
+     */
+    private final String label;
+
+    /**
+     * Measure of the estimated quality of the suggestion, the bigger, the
+     * better. The actual range of values is data and implementation specific.
+     */
+    private final Double confidence;
+
+    /**
+     * List of types of the related entity. This typically a list of owl:Class
+     * from the DBpedia ontology (e.g. 'http://dbpedia.org/ontology/Person').
+     */
+    private final String[] type;
+
+    public TagInfo(String id, String label, String[] type, double confidence) {
+    	if(id == null){
+    		throw new IllegalArgumentException("Parameter id MUST NOT be NULL");
+    	}
+        this.id = id;
+        this.label = label;
+        this.type = type;
+        this.confidence = confidence;
+    }
+
+    @Override
+    public String toString() {
+        return String.format("%s [%f]", label, confidence);
+    }
+    /**
+     * Checks for != null, instanceof TagInfor and equals id
+     */
+    @Override
+    public boolean equals(Object obj) {
+    	return obj != null && obj instanceof TagInfo && ((TagInfo)obj).id.equals(id) && ((TagInfo)obj).confidence.equals(confidence);
+    }
+    public final String getId() {
+		return id;
+	}
+
+	public final String getLabel() {
+		return label;
+	}
+
+	public final Double getConfidence() {
+		return confidence;
+	}
+
+	public final String[] getType() {
+		return type;
+	}
+
+	/**
+     * Implementation based on the id and confidence property
+     */
+    @Override
+    public int hashCode() {
+    	return id.hashCode()+confidence.hashCode();
+    }
+
+}

Added: incubator/stanbol/trunk/iks-autotagging/src/main/java/eu/iksproject/autotagging/cli/CommandLineRunner.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/iks-autotagging/src/main/java/eu/iksproject/autotagging/cli/CommandLineRunner.java?rev=1041331&view=auto
==============================================================================
--- incubator/stanbol/trunk/iks-autotagging/src/main/java/eu/iksproject/autotagging/cli/CommandLineRunner.java (added)
+++ incubator/stanbol/trunk/iks-autotagging/src/main/java/eu/iksproject/autotagging/cli/CommandLineRunner.java Thu Dec  2 11:30:36 2010
@@ -0,0 +1,241 @@
+package eu.iksproject.autotagging.cli;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.List;
+import java.util.zip.GZIPInputStream;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.CommandLineParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.Options;
+import org.apache.commons.cli.ParseException;
+import org.apache.commons.cli.PosixParser;
+import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
+import org.apache.commons.io.IOUtils;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
+
+import com.hp.hpl.jena.rdf.model.Model;
+import com.hp.hpl.jena.tdb.TDBFactory;
+
+import eu.iksproject.autotagging.Autotagger;
+import eu.iksproject.autotagging.TagInfo;
+import eu.iksproject.autotagging.jena.ModelIndexer;
+import eu.iksproject.autotagging.jena.ModelResampler;
+
+/**
+ * Command line User Interface for importing RDF data into Jena models from
+ * dumps, sampling the relevant part and indexing the results with Lucene.
+ *
+ * @author ogrisel
+ */
+public class CommandLineRunner {
+
+    /**
+     * Build the options shared by all the commands: -h/--help and
+     * -d/--debug, both without argument.
+     *
+     * @return a new Options instance holding the shared options
+     */
+    public static Options makeCommonOptions() {
+        final Options common = new Options();
+        return common
+                .addOption("h", "help", false, "display this help and exit")
+                .addOption("d", "debug", false,
+                        "show debug stacktrace upon error");
+    }
+
+    /**
+     * Handle the "model" command: load one or more RDF dumps into a TDB
+     * model. Files ending with ".gz" or ".bz2" are decompressed on the fly;
+     * the RDF syntax is derived from the remaining extension (".nt",
+     * ".n3", anything else being read with the default format).
+     *
+     * @param args the command line arguments without the command name: the
+     *            location of the TDB model followed by the files to load
+     * @throws ParseException if the arguments cannot be parsed
+     * @throws IOException if a file cannot be read
+     */
+    public static void handleModel(String[] args) throws ParseException,
+            IOException {
+        CommandLineParser parser = new PosixParser();
+        Options options = makeCommonOptions();
+        CommandLine line = parser.parse(options, args);
+        args = line.getArgs();
+
+        if (args.length < 2 || line.hasOption("h")) {
+            HelpFormatter formatter = new HelpFormatter();
+            formatter.printHelp(
+                    "model /path/to/tdb-model file.nt [file2.n3.gz file3.xml.bz2 ...]",
+                    options);
+            System.exit(0);
+        }
+        // main() has already stripped the command name: the first remaining
+        // argument is the model location, the following ones are the dumps
+        String modelPath = args[0];
+        Model model = TDBFactory.createModel(modelPath);
+        try {
+            for (String filename : Arrays.asList(args).subList(1, args.length)) {
+                System.out.printf("loading '%s' into model '%s'...", filename,
+                        modelPath);
+                InputStream is = new FileInputStream(filename);
+                try {
+                    if (filename.endsWith(".gz")) {
+                        is = new GZIPInputStream(is);
+                        filename = filename.replaceFirst("\\.gz$", "");
+                    } else if (filename.endsWith(".bz2")) {
+                        is = new BZip2CompressorInputStream(is);
+                        filename = filename.replaceFirst("\\.bz2$", "");
+                    }
+
+                    String format = null;
+                    if (filename.endsWith(".nt")) {
+                        format = "N-TRIPLE";
+                    } else if (filename.endsWith(".n3")) {
+                        format = "N3";
+                    } // XML is the default format
+
+                    model.read(is, null, format);
+                } finally {
+                    // closes the decompressing wrapper (if any) and the file
+                    is.close();
+                }
+                System.out.println(" done");
+            }
+        } finally {
+            model.close();
+        }
+    }
+
+    /**
+     * Handle the "resample" command: delegate to ModelResampler.resample
+     * with the source model, the target model, the TSV score file (option
+     * -s) and the maximum number of resources to sample (option -t,
+     * defaulting to 10000).
+     *
+     * Any error is reported on stderr (with the stacktrace if -d is given)
+     * and terminates the JVM with exit code 5.
+     *
+     * @param args the command line arguments without the command name
+     * @throws ParseException if the arguments cannot be parsed
+     */
+    public static void handleResample(String[] args) throws ParseException {
+        CommandLineParser parser = new PosixParser();
+        Options options = makeCommonOptions();
+        Option maxTopResourcesOpt = new Option("t", "max-top-resources", true,
+                "maximum number of resources to sample");
+        maxTopResourcesOpt.setType(Integer.class);
+        options.addOption(maxTopResourcesOpt);
+        Option scoreFileOpt = new Option("s", "score-file", true,
+                "use TSV file holding ranked and scored resources");
+        options.addOption(scoreFileOpt);
+        CommandLine line = parser.parse(options, args);
+        boolean debug = line.hasOption("d");
+        args = line.getArgs();
+        if (args.length != 2 || line.hasOption("h")) {
+            HelpFormatter formatter = new HelpFormatter();
+            formatter.printHelp(
+                    "resample /path/to/src-tdb-model /path/to/sampled-tdb-model",
+                    options);
+            System.exit(0);
+        }
+        try {
+            int maxTopResources = Integer.parseInt(line.getOptionValue("t",
+                    "10000"));
+            String scores = line.getOptionValue("s");
+            if (scores == null) {
+                // new File(null) would fail with an unhelpful
+                // NullPointerException: report the actual problem instead
+                throw new IllegalArgumentException(
+                        "missing required option: -s / --score-file");
+            }
+            ModelResampler.resample(new File(args[0]), new File(args[1]),
+                    new File(scores), maxTopResources);
+        } catch (Exception e) {
+            System.err.println(String.format("ERROR: %s - %s",
+                    e.getClass().getSimpleName(), e.getMessage()));
+            if (debug) {
+                e.printStackTrace();
+            }
+            System.exit(5);
+        }
+    }
+
+    public static void handleIndex(String[] args) throws ParseException {
+        CommandLineParser parser = new PosixParser();
+        Options options = makeCommonOptions();
+        CommandLine line = parser.parse(options, args);
+        args = line.getArgs();
+        if (args.length < 2 || line.hasOption("h")) {
+            HelpFormatter formatter = new HelpFormatter();
+            formatter.printHelp(
+                    "index /path/to/tdb-model /path/to/lucene-index", options);
+            System.exit(0);
+        }
+        try {
+            ModelIndexer.index(new File(args[0]), new File(args[1]));
+        } catch (Exception e) {
+            System.err.println("ERROR: " + e.getMessage());
+            System.exit(4);
+        }
+    }
+
+    /**
+     * Handle the "suggest" command: compute and print the suggestions for
+     * the name (option -n) and / or the context given on the command line
+     * (option -c) and / or in a UTF-8 text file (option -f). When both -c
+     * and -f are given, the file content is appended to the -c value.
+     *
+     * The lucene index given with -i is used if any, otherwise the default
+     * index is built by ModelIndexer.buildDefaultIndex.
+     *
+     * @param args the command line arguments without the command name
+     * @throws IOException if the index or the context file cannot be read
+     * @throws ParseException if the arguments cannot be parsed
+     */
+    public static void handleSuggest(String[] args) throws IOException,
+            ParseException {
+        CommandLineParser parser = new PosixParser();
+        Options options = makeCommonOptions();
+
+        options.addOption("i", "index", true,
+                "path to a specific lucene directory");
+
+        options.addOption("n", "name", true,
+                "restrict suggestions to lookup entities matching the provided name");
+
+        options.addOption("c", "context", true,
+                "restrict suggestions to entities similar to the provided context");
+
+        options.addOption("f", "context-file", true,
+                "restrict suggestions to entities similar to the provided utf-8 text file");
+
+        options.addOption("t", "type", true,
+                "restrict suggestions to entities of given type");
+
+        Option maxSuggestionsOpt = new Option("s", "max-suggestions", true,
+                "maximum number of suggestions");
+        maxSuggestionsOpt.setType(Integer.class);
+        options.addOption(maxSuggestionsOpt);
+
+        CommandLine line = parser.parse(options, args);
+        String name = line.getOptionValue("n");
+        // defaults to the empty string, hence never null
+        String context = line.getOptionValue("c", "");
+        String contextFile = line.getOptionValue("f");
+
+        if (line.hasOption("h")
+                || (name == null && context.length() == 0 && contextFile == null)) {
+            HelpFormatter formatter = new HelpFormatter();
+            formatter.printHelp(
+                    "suggest --name \"John Smith\" --context-file smith-biography.txt ",
+                    options);
+            System.exit(0);
+        }
+
+        String customIndex = line.getOptionValue("i");
+        Directory dir;
+        if (customIndex != null) {
+            dir = FSDirectory.open(new File(customIndex));
+        } else {
+            dir = FSDirectory.open(ModelIndexer.buildDefaultIndex());
+        }
+
+        int maxSuggestions = Integer.parseInt(line.getOptionValue("s", "3"));
+        Autotagger tagger = new Autotagger(dir).withMaxSuggestions(maxSuggestions);
+
+        if (contextFile != null) {
+            InputStream contextStream = new FileInputStream(new File(
+                    contextFile));
+            try {
+                // the option is documented as taking a utf-8 file: do not
+                // depend on the platform default charset
+                String fileContext = IOUtils.toString(contextStream, "UTF-8");
+                // complete the -c context instead of discarding it
+                context = context.length() == 0 ? fileContext : context + " "
+                        + fileContext;
+            } finally {
+                contextStream.close();
+            }
+        }
+        String type = line.getOptionValue("t");
+
+        System.out.printf("Computing suggestions...");
+        long startTime = System.currentTimeMillis();
+        List<TagInfo> suggestions = tagger.suggestForType(name, context, type);
+        System.out.printf(" done in %dms:\n",
+                (System.currentTimeMillis() - startTime));
+
+        for (int i = 0; i < suggestions.size(); i++) {
+            TagInfo tag = suggestions.get(i);
+            System.out.printf("Suggestion #%d (score: %f): '%s'\n", i + 1,
+                    tag.getConfidence(), tag.getLabel());
+            System.out.printf("URI:\t%s\n", tag.getId());
+            for (String tagType : tag.getType()) {
+                System.out.printf("type:\t%s\n", tagType);
+            }
+        }
+    }
+
+    /**
+     * Entry point: dispatch to the handler of the command given as first
+     * argument (model, resample, index or suggest), passing it the
+     * remaining arguments. Exits with code 1 if no command is given and
+     * with code 5 if the command is unknown.
+     *
+     * @param args the command name followed by the command arguments
+     * @throws IOException if raised by the command handler
+     * @throws ParseException if the command arguments cannot be parsed
+     */
+    public static void main(String[] args) throws IOException, ParseException {
+        if (args.length < 1) {
+            System.out.println("expected command: model, resample, index or suggest");
+            System.exit(1);
+        }
+
+        String command = args[0];
+        String[] commandArgs = Arrays.copyOfRange(args, 1, args.length);
+
+        if (command.equals("model")) {
+            handleModel(commandArgs);
+        } else if (command.equals("resample")) {
+            handleResample(commandArgs);
+        } else if (command.equals("index")) {
+            handleIndex(commandArgs);
+        } else if (command.equals("suggest")) {
+            handleSuggest(commandArgs);
+        } else {
+            // println: terminate the line so that the shell prompt does not
+            // end up glued to the message
+            System.err.println("unknown command: " + command);
+            System.exit(5);
+        }
+    }
+
+}

Added: incubator/stanbol/trunk/iks-autotagging/src/main/java/eu/iksproject/autotagging/jena/ModelIndexer.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/iks-autotagging/src/main/java/eu/iksproject/autotagging/jena/ModelIndexer.java?rev=1041331&view=auto
==============================================================================
--- incubator/stanbol/trunk/iks-autotagging/src/main/java/eu/iksproject/autotagging/jena/ModelIndexer.java (added)
+++ incubator/stanbol/trunk/iks-autotagging/src/main/java/eu/iksproject/autotagging/jena/ModelIndexer.java Thu Dec  2 11:30:36 2010
@@ -0,0 +1,283 @@
+package eu.iksproject.autotagging.jena;
+
+import java.io.Closeable;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URLDecoder;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.commons.lang.NotImplementedException;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriter.MaxFieldLength;
+import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.store.LockObtainFailedException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.hp.hpl.jena.query.QuerySolution;
+import com.hp.hpl.jena.query.ResultSet;
+import com.hp.hpl.jena.rdf.model.Literal;
+import com.hp.hpl.jena.rdf.model.Model;
+import com.hp.hpl.jena.rdf.model.ModelFactory;
+import com.hp.hpl.jena.rdf.model.Property;
+import com.hp.hpl.jena.rdf.model.Resource;
+import com.hp.hpl.jena.rdf.model.Statement;
+import com.hp.hpl.jena.rdf.model.StmtIterator;
+import com.hp.hpl.jena.tdb.TDBFactory;
+
+import eu.iksproject.autotagging.Autotagger;
+
+/**
+ * Build a Lucene index out of a Jena model
+ *
+ * @author ogrisel
+ *
+ */
+public class ModelIndexer implements Closeable {
+
+    private static final Logger log = LoggerFactory.getLogger(ModelIndexer.class);
+
+    public static final String URI_FIELD = "uri";
+
+    public static final String DEFAULT_DBPEDIA_SAMPLE = "dbpedia/dbpedia-sample-10000.nt";
+
+    public static final String POPULARITY_SCORE_PROPERTY = "http://www.iksproject.eu/ns/popularity-score";
+
+    private final IndexWriter iwriter;
+
+    private final Model model;
+
+    // reduce GC load by reusing Document and Fields instances
+    private final Map<String, Field> literalFields = new HashMap<String, Field>();
+
+    private final Map<String, Map<String, Field>> uriFields = new HashMap<String, Map<String, Field>>();
+
+    private final Document doc = new Document();
+
+    private final Map<String, Float> boostedFields = new HashMap<String, Float>();
+
+    private final String scorePropertyUri = POPULARITY_SCORE_PROPERTY;
+
+    /**
+     * Get the lucene Field to index the given value of the given property,
+     * reusing previously allocated instances when possible.
+     *
+     * Literal values are indexed analyzed and not stored, using one cached
+     * instance per property whose value is updated in place. URI values are
+     * indexed stored and not analyzed, using one cached instance per
+     * (property, value) pair.
+     *
+     * @param property the name of the field
+     * @param value the value of the field
+     * @param isLiteral true for a literal value, false for a URI
+     * @return the field to add to the document being built
+     */
+    private Field getField(String property, String value, boolean isLiteral) {
+        if (isLiteral) {
+            Field cachedField = literalFields.get(property);
+            if (cachedField == null) {
+                cachedField = new Field(property, value, Field.Store.NO,
+                        Field.Index.ANALYZED);
+                literalFields.put(property, cachedField);
+            } else if (doc.getFields().contains(cachedField)) {
+                // multi-valued property: the cached instance already holds
+                // another value of the document being built, updating it in
+                // place would index the last value several times and lose
+                // the previous ones.
+                // NOTE(review): contains() is assumed to compare by identity
+                // (Field not overriding equals) - confirm. The scan is linear
+                // in the number of fields of the current document.
+                return new Field(property, value, Field.Store.NO,
+                        Field.Index.ANALYZED);
+            } else {
+                cachedField.setValue(value);
+            }
+            return cachedField;
+        } else {
+            // TODO: make sure that the multivalued URI properties take value in
+            // a limit size controlled vocabulary which is the case for types,
+            // but not for relations between entities
+            Map<String, Field> cachedFields = uriFields.get(property);
+            if (cachedFields == null) {
+                cachedFields = new HashMap<String, Field>();
+                uriFields.put(property, cachedFields);
+            }
+            Field cachedField = cachedFields.get(value);
+            if (cachedField == null) {
+                cachedField = new Field(property, value, Field.Store.YES,
+                        Field.Index.NOT_ANALYZED);
+                cachedFields.put(value, cachedField);
+            }
+            return cachedField;
+        }
+    }
+
+    public ModelIndexer(final IndexWriter iwriter, final Model model) {
+        this.iwriter = iwriter;
+        this.model = model;
+
+        // by default boost the title (a.k.a. rdfs:label of the entity)
+        boostedFields.put("http://www.w3.org/2000/01/rdf-schema#label", 3.0f);
+    }
+
+    /**
+     * @return the live, modifiable mapping from field name to boost factor
+     *         used by this indexer
+     */
+    public Map<String, Float> getBoostedFields() {
+        return this.boostedFields;
+    }
+
+    /**
+     * Close the index writer and the model. The model is closed even if
+     * closing the index writer fails.
+     *
+     * @throws IOException if the index writer cannot be closed
+     */
+    public void close() throws IOException {
+        try {
+            iwriter.close();
+        } finally {
+            model.close();
+        }
+    }
+
+    /**
+     * Build an iterator over the resources returned by
+     * ModelResampler.queryAllResources for the model of this indexer.
+     *
+     * Iterating has side effects: each call to next() builds the lucene
+     * document of the next resource AND adds it to the index writer. The
+     * same Document instance is cleared and reused for every resource, so
+     * the returned document is only valid until the following call to
+     * next(). Nothing is committed by the iterator.
+     *
+     * @return an iterator indexing one resource per call to next(); remove()
+     *         is not supported
+     */
+    public Iterator<Document> indexIterator() {
+        ModelResampler sampler = new ModelResampler();
+        final ResultSet resultSet = sampler.queryAllResources(model);
+        final Property scoreProperty = model.getProperty(scorePropertyUri);
+
+        return new Iterator<Document>() {
+
+            public boolean hasNext() {
+                return resultSet.hasNext();
+            }
+
+            public Document next() {
+                QuerySolution solution = resultSet.next();
+                Resource r = solution.getResource("resource");
+                StmtIterator stmts = model.listStatements(r, null, null, null);
+                // recycle the shared document: drop the fields of the
+                // previous resource before adding the ones of this resource
+                doc.getFields().clear();
+                doc.add(getField(URI_FIELD, r.getURI(), false));
+                List<Statement> stmtList = stmts.toList();
+
+                // find document boost info if any
+                float docBoost = 1.0f;
+                Statement toDelete = null;
+                for (Statement stmt : stmtList) {
+                    if (stmt.getPredicate().equals(scoreProperty)) {
+                        docBoost = stmt.getFloat();
+                        toDelete = stmt;
+                    }
+                }
+                // the score statement only drives the boost, it is not
+                // indexed as a field
+                if (toDelete != null) {
+                    stmtList.remove(toDelete);
+                }
+
+                // index all statement objects as lucene fields
+                for (Statement stmt : stmtList) {
+                    String text;
+                    boolean isLiteral = stmt.getObject().isLiteral();
+                    if (isLiteral) {
+                        text = stmt.getObject().as(Literal.class).getString();
+                    } else if (stmt.getObject().isURIResource()) {
+                        text = stmt.getObject().as(Resource.class).getURI();
+                    } else {
+                        // skip non indexable nodes (blank nodes, seqs, bags,
+                        // ...)
+                        continue;
+                    }
+                    Field field = getField(stmt.getPredicate().toString(),
+                            text, isLiteral);
+                    // the boost is always set explicitly as field instances
+                    // are reused: combine the field boost (if any) with the
+                    // boost of the resource
+                    Float boost = boostedFields.get(field.name());
+                    if (boost != null) {
+                        field.setBoost(boost * docBoost);
+                    } else {
+                        field.setBoost(docBoost);
+                    }
+                    doc.add(field);
+                }
+                try {
+                    iwriter.addDocument(doc);
+                } catch (Exception e) {
+                    // best effort: a failing resource is logged and the
+                    // iteration goes on with the next one
+                    log.error("error indexing " + r.getURI(), e);
+                    return doc;
+                }
+                return doc;
+
+            }
+
+            public void remove() {
+                throw new NotImplementedException();
+            }
+        };
+    }
+
+    /**
+     * Index all the resources of the model with the given writer. The
+     * writer is committed every 5000 entities and once at the end, then the
+     * index is optimized.
+     *
+     * @param model the Jena model to read the resources from
+     * @param writer the lucene writer to add the documents to
+     * @param close if true both the writer and the model are closed when
+     *            done, whether the indexing succeeded or not
+     * @throws IOException if the index cannot be written
+     */
+    public static void index(Model model, IndexWriter writer, boolean close)
+            throws IOException {
+        ModelIndexer indexer = new ModelIndexer(writer, model);
+        try {
+            log.info("computing the list of entities to process...");
+            long lastTime = System.currentTimeMillis();
+            Iterator<Document> iterator = indexer.indexIterator();
+            long newTime = System.currentTimeMillis();
+            log.info(String.format("query took %fs",
+                    (newTime - lastTime) / 1000.));
+            lastTime = newTime;
+            // number of entities processed so far (starting the count at 1
+            // made the final report one entity too high)
+            int indexed = 0;
+            long checkpointSize = 5000;
+            while (iterator.hasNext()) {
+                // the iterator adds the document to the writer by itself
+                Document doc = iterator.next();
+                indexed++;
+                if (indexed % checkpointSize == 0) {
+                    writer.commit();
+                    newTime = System.currentTimeMillis();
+                    double duration = (newTime - lastTime) / 1000.;
+                    log.info(String.format(
+                            "indexed entity %09d '%s' at %f entities/s",
+                            indexed,
+                            URLDecoder.decode(doc.get(URI_FIELD), "UTF-8"),
+                            checkpointSize / duration));
+                    lastTime = newTime;
+                }
+            }
+            writer.commit();
+            log.info(String.format(
+                    "successfully indexed %09d entities, now optimizing the index",
+                    indexed));
+            writer.optimize();
+        } finally {
+            if (close) {
+                indexer.close();
+            }
+        }
+    }
+
+    /**
+     * Index the TDB model stored at the given location into a lucene index
+     * created in the given directory.
+     *
+     * @param tdbModel the location of the TDB model
+     * @param fsDirectory the directory of the lucene index
+     */
+    public static void index(File tdbModel, File fsDirectory)
+            throws CorruptIndexException, LockObtainFailedException,
+            IOException {
+        final String tdbLocation = tdbModel.getAbsolutePath();
+        index(TDBFactory.createModel(tdbLocation), fsDirectory);
+    }
+
+    /**
+     * Index the model into a lucene index created in the given directory
+     * with the default analyzer of the Autotagger. Both the index writer and
+     * the model are closed when done.
+     *
+     * @param model the Jena model to read the resources from
+     * @param fsDirectory the directory of the lucene index
+     */
+    public static void index(Model model, File fsDirectory)
+            throws CorruptIndexException, LockObtainFailedException,
+            IOException {
+        final MaxFieldLength fieldLimit = new MaxFieldLength(100000);
+        // TODO: re-enable shingles once we can get rid of the "-"
+        // shingles
+        final IndexWriter indexWriter = new IndexWriter(
+                FSDirectory.open(fsDirectory),
+                Autotagger.getDefaultAnalyzer(), true, fieldLimit);
+        indexWriter.setRAMBufferSizeMB(42);
+        // true: let the indexer close the writer and the model
+        index(model, indexWriter, true);
+    }
+
+    /**
+     * @return the name of the folder holding the default index
+     */
+    public static String DEFAULT_INDEX_DIRECTORY() {
+        final String folderName = "default-iks-autotagging-idx";
+        return folderName;
+    }
+
+    /**
+     * Build the default index with the default settings: no parent folder
+     * given and no deletion of an existing index.
+     *
+     * @return the directory of the default index
+     */
+    public static File buildDefaultIndex() throws CorruptIndexException,
+            LockObtainFailedException, IOException {
+        final File defaultFolder = null;
+        return buildDefaultIndex(defaultFolder, false);
+    }
+
+    /**
+     * Build the index of the DBpedia sample found on the classpath, unless
+     * the index directory already exists.
+     *
+     * @param folder the parent folder of the index directory; the
+     *            java.io.tmpdir folder is used when null
+     * @param deleteExisting if true an existing index directory is deleted
+     *            first, hence the index is always rebuilt
+     * @return the directory of the default index
+     * @throws IOException if the sample cannot be found on the classpath or
+     *             if the index cannot be written
+     */
+    public static File buildDefaultIndex(File folder, boolean deleteExisting)
+            throws CorruptIndexException, LockObtainFailedException,
+            IOException {
+        if (folder == null) {
+            folder = new File(System.getProperty("java.io.tmpdir"));
+        }
+        File fsDirectory = new File(folder, DEFAULT_INDEX_DIRECTORY());
+        if (deleteExisting) {
+            log.info("deleting default indexed model in: "
+                    + fsDirectory.getAbsolutePath());
+            FileUtils.deleteDirectory(fsDirectory);
+        }
+        if (!fsDirectory.exists()) {
+            log.info("creating default indexed model in: "
+                    + fsDirectory.getAbsolutePath());
+            InputStream stream = ModelIndexer.class.getClassLoader().getResourceAsStream(
+                    DEFAULT_DBPEDIA_SAMPLE);
+            if (stream == null) {
+                throw new IOException("could not find resource: "
+                        + DEFAULT_DBPEDIA_SAMPLE);
+            }
+            Model model = ModelFactory.createDefaultModel();
+            try {
+                model.read(stream, null, "N-TRIPLE");
+            } finally {
+                // the sample is fully loaded in memory (or the load failed):
+                // the resource stream is not needed any longer
+                stream.close();
+            }
+            ModelIndexer.index(model, fsDirectory);
+        }
+        return fsDirectory;
+    }
+
+}

Added: incubator/stanbol/trunk/iks-autotagging/src/main/java/eu/iksproject/autotagging/jena/ModelResampler.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/iks-autotagging/src/main/java/eu/iksproject/autotagging/jena/ModelResampler.java?rev=1041331&view=auto
==============================================================================
--- incubator/stanbol/trunk/iks-autotagging/src/main/java/eu/iksproject/autotagging/jena/ModelResampler.java (added)
+++ incubator/stanbol/trunk/iks-autotagging/src/main/java/eu/iksproject/autotagging/jena/ModelResampler.java Thu Dec  2 11:30:36 2010
@@ -0,0 +1,325 @@
+package eu.iksproject.autotagging.jena;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.UnsupportedEncodingException;
+import java.net.URLDecoder;
+import java.util.Iterator;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang.NotImplementedException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.hp.hpl.jena.query.Query;
+import com.hp.hpl.jena.query.QueryExecution;
+import com.hp.hpl.jena.query.QueryExecutionFactory;
+import com.hp.hpl.jena.query.QueryFactory;
+import com.hp.hpl.jena.query.QuerySolution;
+import com.hp.hpl.jena.query.QuerySolutionMap;
+import com.hp.hpl.jena.query.ResultSet;
+import com.hp.hpl.jena.query.Syntax;
+import com.hp.hpl.jena.rdf.model.Model;
+import com.hp.hpl.jena.rdf.model.ModelFactory;
+import com.hp.hpl.jena.rdf.model.Resource;
+import com.hp.hpl.jena.rdf.model.StmtIterator;
+import com.hp.hpl.jena.tdb.TDBFactory;
+
+/**
+ * Read a Jena model and extract the most popular resources given by a
+ * "tab separated values" file that holds the rank information of the
+ * resources.
+ *
+ * For instance, such a file can be computed from the page links info of DBpedia
+ * using the corpusmaker toolkit: http://github.com/ogrisel/corpusmaker
+ *
+ * As it takes from 1h to 3h to compute such statistics, a precomputed file is
+ * available here:
+ *
+ * http://dl.dropbox.com/u/5743203/IKS/autotagging/incoming-counts-redirected.
+ * tsv.gz
+ *
+ * @author ogrisel
+ */
+public class ModelResampler {
+
+    private final Logger log = LoggerFactory.getLogger(getClass());
+
+    protected int maxTopResources = 10000;
+
+    protected File tsvScoreFile;
+
+    /**
+     * Create a resampler with the default settings: maxTopResources keeps its
+     * initial value and no precomputed scores file is set.
+     */
+    public ModelResampler() {
+        // use default values
+    }
+
+    /**
+     * Set the maximum number of top resources to sample.
+     *
+     * @param maxTopResources the new limit
+     * @return this instance, to allow chained configuration calls
+     */
+    public ModelResampler withMaxTopResources(int maxTopResources) {
+        this.maxTopResources = maxTopResources;
+        return this;
+    }
+
+    /**
+     * Set the "tab separated values" file holding precomputed scores.
+     *
+     * @param tsvRanksFile the scores file; when left null, findTopResources
+     *            falls back to querying the model
+     * @return this instance, to allow chained configuration calls
+     */
+    public ModelResampler withPrecomputedScoresFile(File tsvRanksFile) {
+        this.tsvScoreFile = tsvRanksFile;
+        return this;
+    }
+
+    /**
+     * Select every distinct URI resource of the model that carries at least
+     * one type statement. The query requests no particular ordering.
+     *
+     * @param model the model to query
+     * @return a result set where 'resource' is bound to a matching resource
+     */
+    public ResultSet queryAllResources(Model model) {
+        String sparql = "SELECT distinct ?resource   "
+                + "{ "
+                + " ?resource a ?type . "
+                + " FILTER ( isURI(?resource) ) . "
+                + "} ";
+        Query query = QueryFactory.create(sparql, Syntax.syntaxARQ);
+        // no variable is pre-bound: the initial binding is left empty
+        QuerySolution initialBinding = new QuerySolutionMap();
+        QueryExecution execution = QueryExecutionFactory.create(query, model,
+                initialBinding);
+        return execution.execSelect();
+    }
+
+    /**
+     * Perform a query to find the top popular resources by counting incoming
+     * links. At most maxTopResources results are requested, most linked first.
+     * The score of a resource is log(1 + count), divided by the score of the
+     * first result when that result has at least one incoming link (so the
+     * most popular resource has a score of 1.0, unless all scores are 0.0).
+     *
+     * @param model the model to query
+     * @return an iterator over the popular resources with their scores;
+     *         remove() is not supported
+     */
+    public Iterator<ResourceInfo> queryTopResources(Model model) {
+        QuerySolution mapping = new QuerySolutionMap();
+        StringBuilder qb = new StringBuilder();
+        qb.append("SELECT ?resource ( count(?incoming) AS ?count )  ");
+        qb.append("{ ");
+        qb.append(" ?resource a ?type . ");
+        qb.append(" OPTIONAL { ?incoming ?relationship ?resource . } . ");
+        qb.append(" FILTER ( isURI(?resource) ) . ");
+        qb.append("} ");
+        qb.append("GROUP BY ?resource ");
+        qb.append("ORDER BY DESC ( ?count ) ");
+        // plain concatenation rather than String.format("%d"): the latter
+        // localizes digits for some default locales, which SPARQL cannot parse
+        qb.append("OFFSET 0 LIMIT ").append(maxTopResources);
+        Query q = QueryFactory.create(qb.toString(), Syntax.syntaxARQ);
+        final ResultSet resultSet = QueryExecutionFactory.create(q, model,
+                mapping).execSelect();
+        return new Iterator<ResourceInfo>() {
+
+            long sampled = 0;
+
+            // stays at 1.0 when the first result has no incoming link
+            double maxScore = 1.0;
+
+            public boolean hasNext() {
+                return resultSet.hasNext();
+            }
+
+            public ResourceInfo next() {
+                QuerySolution nextSolution = resultSet.nextSolution();
+                double count = nextSolution.getLiteral("count").getDouble();
+                double score = Math.log1p(count);
+                if (sampled == 0 && count > 0) {
+                    // results are sorted by decreasing count: the first one
+                    // carries the maximum score used for normalization
+                    maxScore = score;
+                }
+                sampled++;
+                return new ResourceInfo(nextSolution.getResource("resource"),
+                        score / maxScore);
+            }
+
+            public void remove() {
+                throw new UnsupportedOperationException();
+            }
+        };
+    }
+
+    /**
+     * Iterate over the most popular resources of the model.
+     *
+     * When no precomputed scores file is configured, this delegates to
+     * queryTopResources. Otherwise the file is read lazily, one
+     * "resource TAB score" line at a time, and at most maxTopResources
+     * resources are returned. Lines that do not have exactly two columns,
+     * whose score is not a number, or whose resource is not part of the model
+     * are skipped. Names that do not start with "http://" are prefixed with
+     * "http://dbpedia.org/resource/". Scores are log(1 + value), divided by
+     * the score of the first returned resource when that score is positive.
+     *
+     * The scores file is closed once the iterator is exhausted or the limit
+     * is reached.
+     *
+     * @param model the model the resources must belong to
+     * @return an iterator over the popular resources with their scores;
+     *         remove() is not supported
+     * @throws FileNotFoundException if the scores file cannot be opened
+     * @throws IOException if the scores file cannot be read
+     */
+    @SuppressWarnings("unchecked")
+    public Iterator<ResourceInfo> findTopResources(final Model model)
+            throws FileNotFoundException, IOException {
+        if (tsvScoreFile == null) {
+            return queryTopResources(model);
+        }
+        final FileInputStream scoresStream = new FileInputStream(tsvScoreFile);
+        final Iterator<String> lines;
+        try {
+            lines = IOUtils.lineIterator(scoresStream, "utf-8");
+        } catch (IOException e) {
+            IOUtils.closeQuietly(scoresStream);
+            throw e;
+        }
+        return new Iterator<ResourceInfo>() {
+
+            double maxScore = 1.0;
+
+            int sampled = 0;
+
+            // look-ahead element fetched by hasNext() and consumed by next()
+            ResourceInfo nextRi = null;
+
+            protected ResourceInfo fetchNext(boolean andForget) {
+                ResourceInfo result = nextRi;
+                // loop (instead of recursing) over skipped lines so that a
+                // long run of unknown resources cannot overflow the stack
+                while (result == null && lines.hasNext()) {
+                    String line = lines.next();
+                    String[] parts = line.split("\t");
+                    if (parts.length != 2) {
+                        log.warn(String.format("skipping line: '%s'", line));
+                        continue;
+                    }
+                    double score;
+                    try {
+                        score = Double.parseDouble(parts[1].trim());
+                    } catch (NumberFormatException e) {
+                        log.warn(String.format("skipping line: '%s'", line));
+                        continue;
+                    }
+                    // take the log to avoid over popular entities to
+                    // dominate the results (attenuate the Zipf law of
+                    // culturally generated distribution)
+                    score = Math.log1p(score);
+                    if (sampled == 0 && score > 0) {
+                        maxScore = score;
+                    }
+                    String resource = parts[0].trim();
+                    if (!resource.startsWith("http://")) {
+                        resource = "http://dbpedia.org/resource/"
+                                + resource;
+                    }
+                    Resource r = model.createResource(resource);
+                    if (!model.containsResource(r)) {
+                        log.debug(String.format(
+                                "skipping resource: '%s', not found in model",
+                                resource));
+                        continue;
+                    }
+                    result = new ResourceInfo(r, score / maxScore);
+                }
+                if (result == null) {
+                    // end of file reached: release the file handle
+                    IOUtils.closeQuietly(scoresStream);
+                }
+                nextRi = andForget ? null : result;
+                return result;
+            }
+
+            public boolean hasNext() {
+                if (sampled >= maxTopResources) {
+                    // limit reached: nothing more will be read from the file
+                    IOUtils.closeQuietly(scoresStream);
+                    return false;
+                }
+                return fetchNext(false) != null;
+            }
+
+            public ResourceInfo next() {
+                if (!hasNext()) {
+                    throw new java.util.NoSuchElementException();
+                }
+                ResourceInfo next = fetchNext(true);
+                sampled++;
+                return next;
+            }
+
+            public void remove() {
+                throw new UnsupportedOperationException();
+            }
+        };
+    }
+
+    /**
+     * Iteratively sample statements carried by popular resources of sourceModel
+     * into targetModel.
+     *
+     * The copy happens lazily: each call to next() on the returned iterator
+     * adds to targetModel the statements of sourceModel having the next
+     * popular resource as subject, plus one statement holding its popularity
+     * score.
+     *
+     * @param sourceModel model to sample popular resources from
+     * @param targetModel model to save resource attributes to
+     * @return an iterator over popular resources to monitor progress;
+     *         remove() is not supported
+     * @throws IOException
+     * @throws FileNotFoundException
+     */
+    public Iterator<ResourceInfo> samplerIterator(final Model sourceModel,
+            final Model targetModel) throws FileNotFoundException, IOException {
+        final Iterator<ResourceInfo> topResources = findTopResources(sourceModel);
+        return new Iterator<ResourceInfo>() {
+
+            public boolean hasNext() {
+                return topResources.hasNext();
+            }
+
+            public ResourceInfo next() {
+                ResourceInfo ri = topResources.next();
+                StmtIterator stmts = sourceModel.listStatements(ri.resource,
+                        null, null, null);
+                targetModel.add(stmts);
+                targetModel.add(targetModel.createLiteralStatement(
+                        ri.resource,
+                        targetModel.getProperty(ModelIndexer.POPULARITY_SCORE_PROPERTY),
+                        ri.score));
+                return ri;
+            }
+
+            public void remove() {
+                // same exception as the other iterators of this class, and the
+                // one the java.util.Iterator contract asks for
+                throw new UnsupportedOperationException();
+            }
+        };
+    }
+
+    /**
+     * Extract the most popular resources ranked by incoming relations into
+     * targetModel, logging the progress every 5000 sampled resources.
+     *
+     * @param sourceModel model to extract popular resource from
+     * @param targetModel model where to save the extracted resources data
+     *
+     * @throws IOException
+     * @throws FileNotFoundException
+     */
+    public void extractMostPopular(Model sourceModel, Model targetModel)
+            throws FileNotFoundException, IOException {
+        log.info("computing the list of resources to sample...");
+        long lastTime = System.currentTimeMillis();
+        Iterator<ResourceInfo> iterator = samplerIterator(sourceModel,
+                targetModel);
+        long newTime = System.currentTimeMillis();
+        log.info(String.format("query took %fs", (newTime - lastTime) / 1000.));
+        lastTime = newTime;
+        // number of resources sampled so far
+        int sampled = 0;
+        long checkpointSize = 5000;
+        while (iterator.hasNext()) {
+            ResourceInfo ri = iterator.next();
+            sampled++;
+            if (sampled == 1 && ri.score == 0.0f) {
+                log.warn(String.format(
+                        "most popular resource '%s' has a score of 0.0...",
+                        ri.resource.getURI()));
+            }
+            if (sampled % checkpointSize == 0) {
+                newTime = System.currentTimeMillis();
+                double duration = (newTime - lastTime) / 1000.;
+                String uri = ri.resource.getURI();
+                try {
+                    log.info(String.format(
+                            "sampled resource %09d (at '%s' with score %f) - %f entities/s",
+                            sampled, URLDecoder.decode(uri, "UTF-8"), ri.score,
+                            checkpointSize / duration));
+                } catch (UnsupportedEncodingException e) {
+                    log.warn(String.format("invalid URI '%s': %s", uri,
+                            e.getMessage()));
+                } catch (IllegalArgumentException e) {
+                    // URLDecoder rejects malformed percent-escapes this way;
+                    // a progress message must not abort the sampling
+                    log.warn(String.format("invalid URI '%s': %s", uri,
+                            e.getMessage()));
+                }
+                lastTime = newTime;
+            }
+        }
+        log.info(String.format("successfully sampled %09d resources", sampled));
+    }
+
+    /**
+     * Sample the most popular resources of a TDB model into a target file or
+     * TDB folder.
+     *
+     * The extension of targetFile selects the output: ".nt" (N-TRIPLE), ".n3"
+     * (N3) and ".xml" (format left null) are first collected in a memory
+     * model and then serialized to targetFile; any other name is handed to
+     * TDBFactory as the location of the target model.
+     *
+     * @param srcTdbFolder folder of the source TDB model
+     * @param targetFile file (or TDB location) receiving the sampled data
+     * @param scoresFile precomputed scores file, passed as-is to the sampler
+     * @param maxTopResources maximum number of resources to sample
+     * @throws IOException if the scores file or the target file cannot be
+     *             accessed
+     */
+    public static void resample(File srcTdbFolder, File targetFile,
+            File scoresFile, int maxTopResources) throws IOException {
+        Model sourceModel = TDBFactory.createModel(srcTdbFolder.getAbsolutePath());
+
+        String filename = targetFile.getName();
+        String format = null;
+        boolean useTemporaryModel = false;
+        if (filename.endsWith(".nt")) {
+            format = "N-TRIPLE";
+            useTemporaryModel = true;
+        } else if (filename.endsWith(".n3")) {
+            format = "N3";
+            useTemporaryModel = true;
+        } else if (filename.endsWith(".xml")) {
+            // format = null will use the XML syntax
+            useTemporaryModel = true;
+        }
+
+        // TODO: use a temporary TDB model in a temporary directory instead of
+        // a memory model that lacks scalability?
+        Model targetModel = useTemporaryModel ? ModelFactory.createDefaultModel()
+                : TDBFactory.createModel(targetFile.getAbsolutePath());
+
+        ModelResampler sampler = new ModelResampler().withMaxTopResources(
+                maxTopResources).withPrecomputedScoresFile(scoresFile);
+        sampler.extractMostPopular(sourceModel, targetModel);
+
+        if (useTemporaryModel) {
+            FileOutputStream out = new FileOutputStream(targetFile);
+            try {
+                targetModel.write(out, format, null);
+            } finally {
+                // release the file handle even when serialization fails
+                out.close();
+            }
+        }
+    }
+}

Added: incubator/stanbol/trunk/iks-autotagging/src/main/java/eu/iksproject/autotagging/jena/ResourceInfo.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/iks-autotagging/src/main/java/eu/iksproject/autotagging/jena/ResourceInfo.java?rev=1041331&view=auto
==============================================================================
--- incubator/stanbol/trunk/iks-autotagging/src/main/java/eu/iksproject/autotagging/jena/ResourceInfo.java (added)
+++ incubator/stanbol/trunk/iks-autotagging/src/main/java/eu/iksproject/autotagging/jena/ResourceInfo.java Thu Dec  2 11:30:36 2010
@@ -0,0 +1,16 @@
+package eu.iksproject.autotagging.jena;
+
+import com.hp.hpl.jena.rdf.model.Resource;
+
+/**
+ * Immutable pair of a resource and the score attached to it.
+ */
+public class ResourceInfo {
+
+    // the scored resource
+    public final Resource resource;
+
+    // the score attached to the resource
+    public final Double score;
+
+
+    /**
+     * @param resource the scored resource
+     * @param score the score attached to the resource
+     */
+    public ResourceInfo(Resource resource, Double score) {
+        this.resource = resource;
+        this.score = score;
+    }
+}

Added: incubator/stanbol/trunk/iks-autotagging/src/main/resources/META-INF/MANIFEST.MF
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/iks-autotagging/src/main/resources/META-INF/MANIFEST.MF?rev=1041331&view=auto
==============================================================================
--- incubator/stanbol/trunk/iks-autotagging/src/main/resources/META-INF/MANIFEST.MF (added)
+++ incubator/stanbol/trunk/iks-autotagging/src/main/resources/META-INF/MANIFEST.MF Thu Dec  2 11:30:36 2010
@@ -0,0 +1,5 @@
+Manifest-Version: 1.0
+Bundle-ManifestVersion: 2
+Bundle-Name: IKS Autotagging
+Bundle-SymbolicName: eu.iksproject.autotagging
+

Added: incubator/stanbol/trunk/iks-autotagging/src/test/java/eu/iksproject/autotagging/AutotaggingTest.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/iks-autotagging/src/test/java/eu/iksproject/autotagging/AutotaggingTest.java?rev=1041331&view=auto
==============================================================================
--- incubator/stanbol/trunk/iks-autotagging/src/test/java/eu/iksproject/autotagging/AutotaggingTest.java (added)
+++ incubator/stanbol/trunk/iks-autotagging/src/test/java/eu/iksproject/autotagging/AutotaggingTest.java Thu Dec  2 11:30:36 2010
@@ -0,0 +1,194 @@
+package eu.iksproject.autotagging;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.queryParser.ParseException;
+import org.apache.lucene.queryParser.QueryParser;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.store.LockObtainFailedException;
+import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.util.Version;
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import com.hp.hpl.jena.rdf.model.Model;
+import com.hp.hpl.jena.rdf.model.ModelFactory;
+
+import eu.iksproject.autotagging.jena.ModelIndexer;
+
+public class AutotaggingTest {
+
+    private Model model;
+
+    private RAMDirectory ramDirectory;
+
+    private StandardAnalyzer analyzer;
+
+    private IndexWriter writer;
+
+    protected File defaultIndexDirectory;
+
+    /**
+     * Open a classpath resource through the context class loader of the
+     * current thread, failing the test when it cannot be found.
+     *
+     * @param name name of the resource to load
+     * @return the opened stream, never null
+     */
+    public static InputStream getResource(String name) {
+        ClassLoader loader = Thread.currentThread().getContextClassLoader();
+        InputStream resource = loader.getResourceAsStream(name);
+        assertNotNull("failed to load resource " + name, resource);
+        return resource;
+    }
+
+    /**
+     * Rebuild the default index once for the whole test class: the existing
+     * index is deleted first (deleteExisting is true).
+     */
+    @BeforeClass
+    public static void setUpDefaultIndex() throws Exception {
+        // create index from scratch
+        ModelIndexer.buildDefaultIndex(null, true);
+    }
+
+    @Before
+    public void setUp() throws CorruptIndexException,
+            LockObtainFailedException, IOException {
+        model = ModelFactory.createDefaultModel();
+        ramDirectory = new RAMDirectory();
+        analyzer = new StandardAnalyzer(Version.LUCENE_30);
+        writer = new IndexWriter(ramDirectory, analyzer, true,
+                new IndexWriter.MaxFieldLength(25000));
+        model.read(getResource("dbpedia_3.4_instancetype_en.nt"), null,
+                "N-TRIPLE");
+        model.read(getResource("dbpedia_3.4_longabstract_en.nt"), null,
+                "N-TRIPLE");
+        // will reuse the index built by setUpDefaultIndex
+        defaultIndexDirectory = ModelIndexer.buildDefaultIndex();
+    }
+
+    /**
+     * Index the memory model and check that a full-text query on the
+     * abstracts finds the expected entity with its types.
+     */
+    @Test
+    public void testIndexing() throws IOException, ParseException {
+        // index model without closing it since it is memory only
+        ModelIndexer.index(model, writer, false);
+        writer.close();
+
+        // query the full-text content of the abstracts found in the model
+        IndexSearcher searcher = new IndexSearcher(ramDirectory, true); // read-only=true
+        QueryParser abstractParser = new QueryParser(Version.LUCENE_30,
+                "http://dbpedia.org/property/abstract", analyzer);
+        Query yearQuery = abstractParser.parse("1981");
+        ScoreDoc[] matches = searcher.search(yearQuery, null, 1000).scoreDocs;
+        assertEquals(1, matches.length);
+
+        // the single match must point to the expected entity
+        Document match = searcher.doc(matches[0].doc);
+        assertEquals("http://dbpedia.org/resource/%21Action_Pact%21",
+                match.get(ModelIndexer.URI_FIELD));
+        String[] typeValues = match.getValues("http://www.w3.org/1999/02/22-rdf-syntax-ns#type");
+        assertEquals(3, typeValues.length);
+        searcher.close();
+    }
+
+    /**
+     * Suggest tags for a short text against the index built by testIndexing
+     * and check the identity, label, types and confidence of the best match.
+     */
+    @Test
+    public void testAutotaggingWithCustomIndex() throws IOException, ParseException {
+        // fill the in-memory index first
+        testIndexing();
+
+        // ask for suggestions on a text mentioning the band
+        Autotagger tagger = new Autotagger(ramDirectory);
+        List<TagInfo> suggestions = tagger.suggest("The punk side in me is telling me to listen to the british band Action Pact.");
+        assertTrue(!suggestions.isEmpty());
+
+        TagInfo best = suggestions.get(0);
+        assertEquals("http://dbpedia.org/resource/%21Action_Pact%21",
+                best.getId());
+        assertEquals("!Action Pact!", best.getLabel());
+        assertEquals(3, best.getType().length);
+        assertEquals(0.59, best.getConfidence(), 0.1f);
+        assertEquals("http://dbpedia.org/ontology/Band", best.getType()[0]);
+        assertEquals("http://dbpedia.org/ontology/Organisation",
+                best.getType()[1]);
+        assertEquals("http://www.w3.org/2002/07/owl#Thing", best.getType()[2]);
+    }
+
+    /**
+     * Suggest a Person from a textual context against the default index and
+     * check the identity, label and types of the best match.
+     */
+    @Test
+    public void testAutotaggingWithDefaultIndex() throws IOException,
+            ParseException {
+
+        // build a tagger using the default DBpedia based index
+        Directory indexDirectory = FSDirectory.open(defaultIndexDirectory);
+        Autotagger tagger = new Autotagger(indexDirectory);
+
+        // perform a context similarity search for a Person
+        String context = "Let the autotagger guess who was a Jamaican"
+                + " musician, a lead singer and guitarist"
+                + " for a well known reggae band.";
+
+        List<TagInfo> suggestions = tagger.suggestForType(context, "Person");
+        assertTrue(!suggestions.isEmpty());
+        TagInfo topSuggestion = suggestions.get(0);
+
+        assertEquals("http://dbpedia.org/resource/Bob_Marley", topSuggestion.getId());
+        assertEquals("Bob Marley", topSuggestion.getLabel());
+
+        // the order of the types is not checked, only their presence
+        List<String> typeUris = Arrays.asList(topSuggestion.getType());
+        assertEquals(4, typeUris.size());
+        assertTrue(typeUris.contains("http://www.w3.org/2002/07/owl#Thing"));
+        assertTrue(typeUris.contains("http://dbpedia.org/ontology/Person"));
+        assertTrue(typeUris.contains("http://dbpedia.org/ontology/Artist"));
+        assertTrue(typeUris.contains("http://dbpedia.org/ontology/MusicalArtist"));
+    }
+
+    /**
+     * Look up a Place by name and context: the strict name lookup finds
+     * nothing for the verbose name, the lax lookup finds Paris.
+     */
+    @Test
+    public void testEntityByNameWithContext() throws IOException {
+
+        // build a tagger using the default DBpedia based index
+        Directory indexDirectory = FSDirectory.open(defaultIndexDirectory);
+        Autotagger tagger = new Autotagger(indexDirectory);
+
+        // fuzzy lookup by entity name
+        String name = "the city of Paris";
+
+        // TODO: find an entity where the context can help filter out ambiguity
+        String context = "The river Seine flows in the city of Paris ";
+
+        // strict name lookup (by default)
+        List<TagInfo> strictMatches = tagger.suggestForType(name, context, "Place");
+        assertTrue(strictMatches.isEmpty());
+
+        // lax name lookup
+        Autotagger laxTagger = tagger.withStrictNameLookup(false);
+        List<TagInfo> laxMatches = laxTagger.suggestForType(name, context,
+                "Place");
+        assertTrue(!laxMatches.isEmpty());
+        TagInfo best = laxMatches.get(0);
+        assertEquals("http://dbpedia.org/resource/Paris", best.getId());
+        assertEquals("Paris", laxMatches.get(0).getLabel());
+    }
+
+    /**
+     * Look up a Place by name only, with the lax name lookup: a name sharing
+     * a term with an entity matches, an unknown name does not.
+     */
+    @Test
+    public void testEntityByNameWithoutContext() throws IOException {
+
+        // build a tagger using the default DBpedia based index
+        Directory indexDirectory = FSDirectory.open(defaultIndexDirectory);
+        Autotagger tagger = new Autotagger(indexDirectory).withStrictNameLookup(false);
+
+        // at least one of the terms is matching
+        List<TagInfo> knownMatches = tagger.suggestForType("The city of Paris",
+                "Place");
+        assertTrue(!knownMatches.isEmpty());
+        assertEquals("http://dbpedia.org/resource/Paris", knownMatches.get(0).getId());
+
+        // try with non existing name
+        List<TagInfo> unknownMatches = tagger.suggestForType(
+                "somethingnot referencedin theindex", "Place");
+        assertTrue(unknownMatches.isEmpty());
+    }
+
+}

Added: incubator/stanbol/trunk/iks-autotagging/src/test/java/eu/iksproject/autotagging/ModelResamplerTest.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/iks-autotagging/src/test/java/eu/iksproject/autotagging/ModelResamplerTest.java?rev=1041331&view=auto
==============================================================================
--- incubator/stanbol/trunk/iks-autotagging/src/test/java/eu/iksproject/autotagging/ModelResamplerTest.java (added)
+++ incubator/stanbol/trunk/iks-autotagging/src/test/java/eu/iksproject/autotagging/ModelResamplerTest.java Thu Dec  2 11:30:36 2010
@@ -0,0 +1,105 @@
+package eu.iksproject.autotagging;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.Iterator;
+
+import org.junit.Before;
+import org.junit.Test;
+
+import com.hp.hpl.jena.rdf.model.Model;
+import com.hp.hpl.jena.rdf.model.ModelFactory;
+import com.hp.hpl.jena.rdf.model.Property;
+import com.hp.hpl.jena.rdf.model.Resource;
+
+import eu.iksproject.autotagging.jena.ModelIndexer;
+import eu.iksproject.autotagging.jena.ModelResampler;
+import eu.iksproject.autotagging.jena.ResourceInfo;
+
+public class ModelResamplerTest {
+
+    protected Model srcModel;
+
+    protected Model targetModel;
+
+    protected Resource[] r;
+
+    protected Property p;
+
+    protected Property score;
+
+    protected Resource personClass;
+
+    protected Property type;
+
+    @Before
+    public void makeModels() {
+        srcModel = ModelFactory.createDefaultModel();
+        targetModel = ModelFactory.createDefaultModel();
+
+        // create properties and resources
+        p = srcModel.createProperty("urn:p");
+        score = srcModel.createProperty(ModelIndexer.POPULARITY_SCORE_PROPERTY);
+        type = srcModel.createProperty("http://www.w3.org/1999/02/22-rdf-syntax-ns#type");
+        personClass = srcModel.createResource("http://dbpedia.org/ontology/Person");
+        r = new Resource[10];
+
+        for (int i = 0; i < r.length; i++) {
+            r[i] = srcModel.createResource(String.format("urn:%d", i));
+            srcModel.add(r[i], type, personClass);
+        }
+
+        // connect resources with p
+        srcModel.add(r[0], p, r[1]);
+        srcModel.add(r[2], p, r[1]);
+        srcModel.add(r[3], p, r[1]);
+        srcModel.add(r[9], p, r[1]);
+
+        srcModel.add(r[0], p, r[2]);
+        srcModel.add(r[4], p, r[2]);
+        srcModel.add(r[9], p, r[2]);
+
+        srcModel.add(r[1], p, r[5]);
+        srcModel.add(r[4], p, r[5]);
+
+        srcModel.add(r[8], p, r[4]);
+
+        srcModel.add(r[4], p, r[8]);
+
+    }
+
+    /**
+     * Sample the two most popular resources of the source model and check
+     * their order, their scores and the statements copied to the target
+     * model.
+     */
+    @Test
+    public void testResampling() throws FileNotFoundException, IOException {
+        ModelResampler resampler = new ModelResampler().withMaxTopResources(2);
+        Iterator<ResourceInfo> sampled = resampler.samplerIterator(srcModel,
+                targetModel);
+
+        // most popular resource: r[1]
+        assertTrue(sampled.hasNext());
+        ResourceInfo first = sampled.next();
+        assertEquals(r[1], first.resource);
+        double r1Score = first.score.doubleValue();
+        assertEquals(1.0, r1Score, 0.01);
+
+        // second most popular resource: r[2]
+        assertTrue(sampled.hasNext());
+        ResourceInfo second = sampled.next();
+        assertEquals(r[2], second.resource);
+        double r2Score = second.score.doubleValue();
+        assertEquals(0.86, r2Score, 0.01);
+
+        // the limit of two resources has been reached
+        assertFalse(sampled.hasNext());
+
+        // type, outgoing link and score statements of both resources
+        assertEquals(6, targetModel.size());
+
+        assertTrue(targetModel.contains(r[1], type, personClass));
+        assertTrue(targetModel.contains(r[1], p, r[5]));
+        assertTrue(targetModel.containsLiteral(r[1], score, r1Score));
+
+        assertTrue(targetModel.contains(r[2], type, personClass));
+        assertTrue(targetModel.contains(r[2], p, r[1]));
+        assertTrue(targetModel.containsLiteral(r[2], score, r2Score));
+    }
+}