You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2012/10/02 08:50:17 UTC

svn commit: r1392756 - in /stanbol/trunk: enhancer/bundlelist/src/main/bundles/ enhancer/engines/tika/ enhancer/engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/ enhancer/engines/tika/src/main/java/org/apache/stanbol/enhancer/engines...

Author: rwesten
Date: Tue Oct  2 06:50:16 2012
New Revision: 1392756

URL: http://svn.apache.org/viewvc?rev=1392756&view=rev
Log:
STANBOL-757: Upgrades Tika to version 1.2 (see comment on the Issue for details)

Modified:
    stanbol/trunk/enhancer/bundlelist/src/main/bundles/list.xml
    stanbol/trunk/enhancer/engines/tika/pom.xml
    stanbol/trunk/enhancer/engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/TikaEngine.java
    stanbol/trunk/enhancer/engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/metadata/OntologyMappings.java
    stanbol/trunk/enhancer/engines/tika/src/test/java/org/apache/stanbol/enhancer/engines/tika/TikaEngineTest.java
    stanbol/trunk/launchers/bundlelists/stanbolcommons/src/main/bundles/list.xml
    stanbol/trunk/parent/pom.xml

Modified: stanbol/trunk/enhancer/bundlelist/src/main/bundles/list.xml
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancer/bundlelist/src/main/bundles/list.xml?rev=1392756&r1=1392755&r2=1392756&view=diff
==============================================================================
--- stanbol/trunk/enhancer/bundlelist/src/main/bundles/list.xml (original)
+++ stanbol/trunk/enhancer/bundlelist/src/main/bundles/list.xml Tue Oct  2 06:50:16 2012
@@ -29,14 +29,14 @@
     <bundle> <!-- Apache Tika core (required by the LangId and TikaEngine) -->
         <groupId>org.apache.tika</groupId>
         <artifactId>tika-core</artifactId>
-        <version>1.1</version>
+        <version>1.2</version>
     </bundle>
   </startLevel>
   <startLevel level="27">
     <bundle> <!-- Apache Tika bundle (required by the TikaEngine) -->
         <groupId>org.apache.tika</groupId>
         <artifactId>tika-bundle</artifactId>
-        <version>1.1</version>
+        <version>1.2</version>
     </bundle>
   </startLevel>
   <!-- Stanbol Enhancer infrastructure and required libraries -->

Modified: stanbol/trunk/enhancer/engines/tika/pom.xml
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancer/engines/tika/pom.xml?rev=1392756&r1=1392755&r2=1392756&view=diff
==============================================================================
--- stanbol/trunk/enhancer/engines/tika/pom.xml (original)
+++ stanbol/trunk/enhancer/engines/tika/pom.xml Tue Oct  2 06:50:16 2012
@@ -60,6 +60,11 @@
             <Export-Package>
               org.apache.stanbol.enhancer.engines.tika;version=${project.version}
             </Export-Package>
+            <!-- Workaround for COMPRESS-199 -->
+            <Embed-Dependency>
+                commons-compress, 
+                xz
+            </Embed-Dependency>
           </instructions>
         </configuration>
       </plugin>
@@ -127,6 +132,18 @@
       <groupId>org.apache.clerezza</groupId>
       <artifactId>rdf.ontologies</artifactId>
     </dependency>
+    <!-- Tika 1.2 requires commons-compress 1.4.1, but that version cannot be used
+         as an OSGi bundle because of COMPRESS-199 -->
+    <dependency>
+        <groupId>org.apache.commons</groupId>
+        <artifactId>commons-compress</artifactId>
+        <version>1.4.1</version>
+    </dependency>
+    <dependency>
+        <groupId>org.tukaani</groupId>
+        <artifactId>xz</artifactId>
+        <version>1.1</version>
+    </dependency>
     <dependency>
       <groupId>commons-io</groupId>
       <artifactId>commons-io</artifactId>

Modified: stanbol/trunk/enhancer/engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/TikaEngine.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancer/engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/TikaEngine.java?rev=1392756&r1=1392755&r2=1392756&view=diff
==============================================================================
--- stanbol/trunk/enhancer/engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/TikaEngine.java (original)
+++ stanbol/trunk/enhancer/engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/TikaEngine.java Tue Oct  2 06:50:16 2012
@@ -238,7 +238,7 @@ public class TikaEngine 
                 ci.addPart(xhtmlBlobUri,  xhtmlSink.getBlob());
             }
             //add the extracted metadata
-            if(log.isDebugEnabled()){
+            if(log.isInfoEnabled()){
                 for(String name : metadata.names()){
                     log.info("{}: {}",name,Arrays.toString(metadata.getValues(name)));
                 }

Modified: stanbol/trunk/enhancer/engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/metadata/OntologyMappings.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancer/engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/metadata/OntologyMappings.java?rev=1392756&r1=1392755&r2=1392756&view=diff
==============================================================================
--- stanbol/trunk/enhancer/engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/metadata/OntologyMappings.java (original)
+++ stanbol/trunk/enhancer/engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/metadata/OntologyMappings.java Tue Oct  2 06:50:16 2012
@@ -165,41 +165,41 @@ public class OntologyMappings implements
         String dc = NamespaceEnum.dc.getNamespace();
         mappings.addMapping(
             new PropertyMapping(dc+"contributor",
-                DublinCore.CONTRIBUTOR,MSOffice.LAST_AUTHOR));
+                DublinCore.CONTRIBUTOR.getName(),MSOffice.LAST_AUTHOR));
         mappings.addMapping(
-            new PropertyMapping(dc+"coverage",DublinCore.COVERAGE));
+            new PropertyMapping(dc+"coverage",DublinCore.COVERAGE.getName()));
         mappings.addMappings(
             new PropertyMapping(dc+"creator",
-                DublinCore.CREATOR,MSOffice.AUTHOR,"initial-creator"));
+                DublinCore.CREATOR.getName(),MSOffice.AUTHOR,"initial-creator"));
         mappings.addMappings( 
-            new PropertyMapping(dc+"description",DublinCore.DESCRIPTION));
+            new PropertyMapping(dc+"description",DublinCore.DESCRIPTION.getName()));
         mappings.addMappings( 
             new PropertyMapping(dc+"format",
-                DublinCore.FORMAT,HttpHeaders.CONTENT_TYPE));
+                DublinCore.FORMAT.getName(),HttpHeaders.CONTENT_TYPE));
         mappings.addMappings( 
-            new PropertyMapping(dc+"identifier",DublinCore.IDENTIFIER));
+            new PropertyMapping(dc+"identifier",DublinCore.IDENTIFIER.getName()));
         mappings.addMappings(
             new PropertyMapping(dc+"language",
-                DublinCore.LANGUAGE,HttpHeaders.CONTENT_LANGUAGE));
+                DublinCore.LANGUAGE.getName(),HttpHeaders.CONTENT_LANGUAGE));
         mappings.addMappings(
             new PropertyMapping(dc+"modified",XSD.dateTime,
-                DublinCore.MODIFIED,"Last-Modified"));
+                DublinCore.MODIFIED.getName(),"Last-Modified"));
         mappings.addMappings( 
             new PropertyMapping(dc+"publisher",
-                DublinCore.PUBLISHER,MSOffice.COMPANY));
+                DublinCore.PUBLISHER.getName(),MSOffice.COMPANY));
         mappings.addMappings( 
-            new PropertyMapping(dc+"relation",DublinCore.RELATION));
+            new PropertyMapping(dc+"relation",DublinCore.RELATION.getName()));
         mappings.addMappings(
-            new PropertyMapping(dc+"rights",DublinCore.RIGHTS));
+            new PropertyMapping(dc+"rights",DublinCore.RIGHTS.getName()));
         mappings.addMappings( 
-            new PropertyMapping(dc+"source",DublinCore.SOURCE));
+            new PropertyMapping(dc+"source",DublinCore.SOURCE.getName()));
         mappings.addMappings( 
             new PropertyMapping(dc+"subject",
-                DublinCore.SUBJECT,MSOffice.KEYWORDS));
+                DublinCore.SUBJECT.getName(),MSOffice.KEYWORDS));
         mappings.addMappings( 
-            new PropertyMapping(dc+"title",DublinCore.TITLE));
+            new PropertyMapping(dc+"title",DublinCore.TITLE.getName()));
         mappings.addMappings( 
-            new PropertyMapping(dc+"type",DublinCore.TYPE));
+            new PropertyMapping(dc+"type",DublinCore.TYPE.getName()));
         mappings.addMappings( 
             new PropertyMapping(dc+"date",XSD.dateTime,DublinCore.DATE.getName()));
         //MS Office -> DC
@@ -211,19 +211,19 @@ public class OntologyMappings implements
     public static void addMediaResourceOntologyMappings(OntologyMappings mappings){
         mappings.addMappings(
             new PropertyMapping(ma+"hasContributor",
-                DublinCore.CONTRIBUTOR,XMPDM.ARTIST.getName(),XMPDM.COMPOSER.getName()));
+                DublinCore.CONTRIBUTOR.getName(),XMPDM.ARTIST.getName(),XMPDM.COMPOSER.getName()));
         mappings.addMapping( 
             new ResourceMapping(ma+"hasLocation",
                 new TypeMapping(ma+"Location"),
-                new PropertyMapping(ma+"locationName",DublinCore.COVERAGE)));
+                new PropertyMapping(ma+"locationName",DublinCore.COVERAGE.getName())));
         mappings.addMappings( 
             new PropertyMapping(ma+"hasCreator",
-                DublinCore.CREATOR,MSOffice.AUTHOR,"initial-creator"));
+                DublinCore.CREATOR.getName(),MSOffice.AUTHOR,"initial-creator"));
         mappings.addMappings( 
-            new PropertyMapping(ma+"description",DublinCore.DESCRIPTION));
+            new PropertyMapping(ma+"description",DublinCore.DESCRIPTION.getName()));
         mappings.addMappings( 
             new PropertyMapping(ma+"hasFormat",
-                DublinCore.FORMAT,HttpHeaders.CONTENT_TYPE));
+                DublinCore.FORMAT.getName(),HttpHeaders.CONTENT_TYPE));
         /*
          * Excerpt of the MA recommendation:
          *   The identifier of a media resource is represented in RDF by the URI 
@@ -231,30 +231,30 @@ public class OntologyMappings implements
          *   identified by several URI, owl:sameAs should be used.
          */
         mappings.addMappings( 
-            new PropertyMapping(OWL.sameAs,RDFS.Resource,DublinCore.IDENTIFIER));
+            new PropertyMapping(OWL.sameAs,RDFS.Resource,DublinCore.IDENTIFIER.getName()));
         mappings.addMappings( 
             new PropertyMapping(ma+"hasLanguage",
-                DublinCore.LANGUAGE,HttpHeaders.CONTENT_LANGUAGE));
+                DublinCore.LANGUAGE.getName(),HttpHeaders.CONTENT_LANGUAGE));
         mappings.addMappings( 
             new PropertyMapping(ma+"editDate",XSD.dateTime,
-                DublinCore.MODIFIED,MSOffice.LAST_SAVED.getName()));
+                DublinCore.MODIFIED.getName(),MSOffice.LAST_SAVED.getName()));
         mappings.addMappings(
-            new PropertyMapping(ma+"hasPublisher",DublinCore.PUBLISHER));
+            new PropertyMapping(ma+"hasPublisher",DublinCore.PUBLISHER.getName()));
         mappings.addMappings( 
-            new PropertyMapping(ma+"hasRelatedResource",DublinCore.RELATION));
+            new PropertyMapping(ma+"hasRelatedResource",DublinCore.RELATION.getName()));
         mappings.addMappings( 
             new PropertyMapping(ma+"copyright",RDFS.Resource,
                 //DC:rights and cc:license
-                DublinCore.RIGHTS,CreativeCommons.LICENSE_LOCATION, CreativeCommons.LICENSE_URL,
+                DublinCore.RIGHTS.getName(),CreativeCommons.LICENSE_LOCATION, CreativeCommons.LICENSE_URL,
                 XMPDM.COPYRIGHT.getName()));
         mappings.addMappings( 
-            new PropertyMapping(ma+"isMemberOf",DublinCore.SOURCE));
+            new PropertyMapping(ma+"isMemberOf",DublinCore.SOURCE.getName()));
         mappings.addMappings( 
             new PropertyMapping(ma+"hasKeyword",
-                DublinCore.SUBJECT,MSOffice.KEYWORDS));
+                DublinCore.SUBJECT.getName(),MSOffice.KEYWORDS));
         mappings.addMappings( 
             new PropertyMapping(ma+"title",
-                DublinCore.TITLE,XMPDM.SCENE.getName(),XMPDM.TAPE_NAME.getName(),
+                DublinCore.TITLE.getName(),XMPDM.SCENE.getName(),XMPDM.TAPE_NAME.getName(),
                 XMPDM.SHOT_NAME.getName()));
         mappings.addMapping(
             new PropertyMapping(ma+"alternativeTitle", XMPDM.ALT_TAPE_NAME.getName()));
@@ -262,13 +262,13 @@ public class OntologyMappings implements
             new PropertyMapping(ma+"mainOriginalTitle", XMPDM.ALBUM.getName()));
         mappings.addMappings( 
             new PropertyMapping(ma+"hasGenre",
-                DublinCore.TYPE,XMPDM.GENRE.getName()));
+                DublinCore.TYPE.getName(),XMPDM.GENRE.getName()));
         mappings.addMappings(
             new PropertyMapping(ma+"creationDate",XSD.dateTime,
                 DublinCore.DATE.getName(),MSOffice.CREATION_DATE.getName(),"created"));
         mappings.addMapping(
             new PropertyMapping(ma+"description", 
-                DublinCore.DESCRIPTION,MSOffice.COMMENTS));
+                DublinCore.DESCRIPTION.getName(),MSOffice.COMMENTS));
         
         mappings.addMappings( 
             new PropertyMapping(ma+"hasContributor",
@@ -400,13 +400,13 @@ public class OntologyMappings implements
         //DC -> SKOS
         mappings.addMappings( 
             new PropertyMapping(SKOS.prefLabel,
-                DublinCore.TITLE));
+                DublinCore.TITLE.getName()));
         mappings.addMappings( 
             new PropertyMapping(SKOS.definition,
-                DublinCore.DESCRIPTION));
+                DublinCore.DESCRIPTION.getName()));
         mappings.addMappings(
             new PropertyMapping(SKOS.notation,
-                DublinCore.IDENTIFIER));
+                DublinCore.IDENTIFIER.getName()));
         //MS Office -> SKOS
         mappings.addMappings( 
             new PropertyMapping(SKOS.note,MSOffice.COMMENTS));
@@ -418,9 +418,9 @@ public class OntologyMappings implements
     public static void addRdfsMappings(OntologyMappings mappings){
         //DC
         mappings.addMappings( 
-            new PropertyMapping(RDFS.label,DublinCore.TITLE));
+            new PropertyMapping(RDFS.label,DublinCore.TITLE.getName()));
         mappings.addMappings( 
-            new PropertyMapping(RDFS.comment,DublinCore.DESCRIPTION,MSOffice.COMMENTS));
+            new PropertyMapping(RDFS.comment,DublinCore.DESCRIPTION.getName(),MSOffice.COMMENTS));
     }
     
     /**

Modified: stanbol/trunk/enhancer/engines/tika/src/test/java/org/apache/stanbol/enhancer/engines/tika/TikaEngineTest.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancer/engines/tika/src/test/java/org/apache/stanbol/enhancer/engines/tika/TikaEngineTest.java?rev=1392756&r1=1392755&r2=1392756&view=diff
==============================================================================
--- stanbol/trunk/enhancer/engines/tika/src/test/java/org/apache/stanbol/enhancer/engines/tika/TikaEngineTest.java (original)
+++ stanbol/trunk/enhancer/engines/tika/src/test/java/org/apache/stanbol/enhancer/engines/tika/TikaEngineTest.java Tue Oct  2 06:50:16 2012
@@ -301,7 +301,7 @@ public class TikaEngineTest {
         assertNotNull(xhtmlBlob);
         assertContentRegexp(xhtmlBlob, 
             "<html xmlns=\"http://www.w3.org/1999/xhtml\">",
-            "<title></title>",
+            "<title>\\[jira\\] Commented: \\(TIKA-461\\) RFC822 messages not parsed</title>",
             "<body><p>",
             "Julien Nioche commented on TIKA-461:",
             "I'll have a look at mime4j and try to use it in Tika",
@@ -310,9 +310,12 @@ public class TikaEngineTest {
             "URL: https://issues.apache.org/jira/browse/TIKA-461");
         //no check the extracted metadata!
         //DC
-        verifyValue(ci, new UriRef(NamespaceEnum.dc+"date"), XSD.dateTime,"2010-09-06T09:25:34Z");
+        //STANBOL-757: dc:date no longer added by Tika 1.2 (dc:created is still present)
+        //verifyValue(ci, new UriRef(NamespaceEnum.dc+"date"), XSD.dateTime,"2010-09-06T09:25:34Z");
         verifyValue(ci, new UriRef(NamespaceEnum.dc+"format"), null,"message/rfc822");
-        verifyValue(ci, new UriRef(NamespaceEnum.dc+"subject"), null,"[jira] Commented: (TIKA-461) RFC822 messages not parsed");
+        //STANBOL-757: dc:subject no longer added by Tika 1.2 (dc:title is used instead)
+        //verifyValue(ci, new UriRef(NamespaceEnum.dc+"subject"), null,"[jira] Commented: (TIKA-461) RFC822 messages not parsed");
+        verifyValue(ci, new UriRef(NamespaceEnum.dc+"title"), null,"[jira] Commented: (TIKA-461) RFC822 messages not parsed");
         verifyValue(ci, new UriRef(NamespaceEnum.dc+"creator"), null,"Julien Nioche (JIRA) <ji...@apache.org>");
         verifyValue(ci, new UriRef(NamespaceEnum.dc+"created"), XSD.dateTime,"2010-09-06T09:25:34Z");
         
@@ -321,7 +324,8 @@ public class TikaEngineTest {
         verifyValue(ci, new UriRef(NamespaceEnum.media+"hasFormat"),null,"message/rfc822");
         verifyValue(ci, new UriRef(NamespaceEnum.media+"hasCreator"),null,"Julien Nioche (JIRA) <ji...@apache.org>");
         verifyValue(ci, new UriRef(NamespaceEnum.media+"hasContributor"),null,"Julien Nioche (JIRA) <ji...@apache.org>");
-        verifyValue(ci, new UriRef(NamespaceEnum.media+"hasKeyword"),null,"[jira] Commented: (TIKA-461) RFC822 messages not parsed");
+        //STANBOL-757: This was present with Tika 1.1 because it is mapped from dc:subject
+//        verifyValue(ci, new UriRef(NamespaceEnum.media+"hasKeyword"),null,"[jira] Commented: (TIKA-461) RFC822 messages not parsed");
 
         
         //Nepomuk Message
@@ -370,7 +374,7 @@ public class TikaEngineTest {
      * @throws IOException
      * @throws ParseException
      */
-    //@Test deactivated because of TIKA-852
+    @Test 
     public void testMp4() throws EngineException, IOException, ParseException {
         log.info(">>> testMp4 <<<");
         ContentItem ci = createContentItem("testMP4.m4a", "audio/mp4");
@@ -481,7 +485,7 @@ public class TikaEngineTest {
         verifyValues(ci, new UriRef(NamespaceEnum.media+"hasKeyword"),null,"serbor","moscow-birds","canon-55-250");
         //and finally the mapped DC properties
         verifyValue(ci, new UriRef(NamespaceEnum.dc+"format"),null,"image/jpeg");
-        verifyValue(ci, new UriRef(NamespaceEnum.dc+"date"),XSD.dateTime,"2009-08-11T09:09:45");
+        verifyValue(ci, new UriRef(NamespaceEnum.dc+"created"),XSD.dateTime,"2009-08-11T09:09:45");
         verifyValue(ci, new UriRef(NamespaceEnum.dc+"modified"),XSD.dateTime,"2009-10-02T23:02:49");
         verifyValues(ci, new UriRef(NamespaceEnum.dc+"subject"), null, "serbor","moscow-birds","canon-55-250");
     }

Modified: stanbol/trunk/launchers/bundlelists/stanbolcommons/src/main/bundles/list.xml
URL: http://svn.apache.org/viewvc/stanbol/trunk/launchers/bundlelists/stanbolcommons/src/main/bundles/list.xml?rev=1392756&r1=1392755&r2=1392756&view=diff
==============================================================================
--- stanbol/trunk/launchers/bundlelists/stanbolcommons/src/main/bundles/list.xml (original)
+++ stanbol/trunk/launchers/bundlelists/stanbolcommons/src/main/bundles/list.xml Tue Oct  2 06:50:16 2012
@@ -104,12 +104,12 @@
     <bundle>
       <groupId>org.apache.commons</groupId>
       <artifactId>commons-compress</artifactId>
-      <version>1.0</version>
+      <version>1.3</version>
     </bundle>
     <bundle>
       <groupId>commons-codec</groupId>
       <artifactId>commons-codec</artifactId>
-      <version>1.5</version>
+      <version>1.7</version>
     </bundle>
     <bundle>
       <groupId>commons-fileupload</groupId>

Modified: stanbol/trunk/parent/pom.xml
URL: http://svn.apache.org/viewvc/stanbol/trunk/parent/pom.xml?rev=1392756&r1=1392755&r2=1392756&view=diff
==============================================================================
--- stanbol/trunk/parent/pom.xml (original)
+++ stanbol/trunk/parent/pom.xml Tue Oct  2 06:50:16 2012
@@ -686,7 +686,8 @@
       <dependency>
         <groupId>org.apache.commons</groupId>
         <artifactId>commons-compress</artifactId>
-        <version>1.0</version>
+        <!-- Unable to use version 1.4+ because of COMPRESS-199 -->
+        <version>1.3</version>
       </dependency>
       <dependency>
         <groupId>commons-logging</groupId>
@@ -696,7 +697,7 @@
       <dependency>
         <groupId>commons-codec</groupId>
         <artifactId>commons-codec</artifactId>
-        <version>1.5</version>
+        <version>1.7</version>
       </dependency>
       <dependency>
         <groupId>commons-fileupload</groupId>
@@ -1149,12 +1150,12 @@
     <dependency>
       <groupId>org.apache.tika</groupId>
       <artifactId>tika-core</artifactId>
-      <version>1.1</version>
+      <version>1.2</version>
     </dependency>
     <dependency>
       <groupId>org.apache.tika</groupId>
       <artifactId>tika-parsers</artifactId>
-      <version>1.1</version>
+      <version>1.2</version>
     </dependency>    
     <!-- Aperture -->
     <dependency>