You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2012/10/02 08:50:17 UTC
svn commit: r1392756 - in /stanbol/trunk:
enhancer/bundlelist/src/main/bundles/ enhancer/engines/tika/
enhancer/engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/
enhancer/engines/tika/src/main/java/org/apache/stanbol/enhancer/engines...
Author: rwesten
Date: Tue Oct 2 06:50:16 2012
New Revision: 1392756
URL: http://svn.apache.org/viewvc?rev=1392756&view=rev
Log:
STANBOL-757: Upgrades Tika to version 1.2 (see comment on the Issue for details)
Modified:
stanbol/trunk/enhancer/bundlelist/src/main/bundles/list.xml
stanbol/trunk/enhancer/engines/tika/pom.xml
stanbol/trunk/enhancer/engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/TikaEngine.java
stanbol/trunk/enhancer/engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/metadata/OntologyMappings.java
stanbol/trunk/enhancer/engines/tika/src/test/java/org/apache/stanbol/enhancer/engines/tika/TikaEngineTest.java
stanbol/trunk/launchers/bundlelists/stanbolcommons/src/main/bundles/list.xml
stanbol/trunk/parent/pom.xml
Modified: stanbol/trunk/enhancer/bundlelist/src/main/bundles/list.xml
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancer/bundlelist/src/main/bundles/list.xml?rev=1392756&r1=1392755&r2=1392756&view=diff
==============================================================================
--- stanbol/trunk/enhancer/bundlelist/src/main/bundles/list.xml (original)
+++ stanbol/trunk/enhancer/bundlelist/src/main/bundles/list.xml Tue Oct 2 06:50:16 2012
@@ -29,14 +29,14 @@
<bundle> <!-- Apache Tika core (required by the LangId and TikaEngine) -->
<groupId>org.apache.tika</groupId>
<artifactId>tika-core</artifactId>
- <version>1.1</version>
+ <version>1.2</version>
</bundle>
</startLevel>
<startLevel level="27">
<bundle> <!-- Apache Tika bundle (required by the TikaEngine) -->
<groupId>org.apache.tika</groupId>
<artifactId>tika-bundle</artifactId>
- <version>1.1</version>
+ <version>1.2</version>
</bundle>
</startLevel>
<!-- Stanbol Enhancer infrastructure and required libraries -->
Modified: stanbol/trunk/enhancer/engines/tika/pom.xml
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancer/engines/tika/pom.xml?rev=1392756&r1=1392755&r2=1392756&view=diff
==============================================================================
--- stanbol/trunk/enhancer/engines/tika/pom.xml (original)
+++ stanbol/trunk/enhancer/engines/tika/pom.xml Tue Oct 2 06:50:16 2012
@@ -60,6 +60,11 @@
<Export-Package>
org.apache.stanbol.enhancer.engines.tika;version=${project.version}
</Export-Package>
+ <!-- Workaround for COMPRESS-199 -->
+ <Embed-Dependency>
+ commons-compress,
+ xz
+ </Embed-Dependency>
</instructions>
</configuration>
</plugin>
@@ -127,6 +132,18 @@
<groupId>org.apache.clerezza</groupId>
<artifactId>rdf.ontologies</artifactId>
</dependency>
+ <!-- Tika 1.2 requires commons-compress 1.4.1, but that version cannot be used
+ as an OSGi bundle because of COMPRESS-199 -->
+ <dependency>
+ <groupId>org.apache.commons</groupId>
+ <artifactId>commons-compress</artifactId>
+ <version>1.4.1</version>
+ </dependency>
+ <dependency>
+ <groupId>org.tukaani</groupId>
+ <artifactId>xz</artifactId>
+ <version>1.1</version>
+ </dependency>
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
Modified: stanbol/trunk/enhancer/engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/TikaEngine.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancer/engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/TikaEngine.java?rev=1392756&r1=1392755&r2=1392756&view=diff
==============================================================================
--- stanbol/trunk/enhancer/engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/TikaEngine.java (original)
+++ stanbol/trunk/enhancer/engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/TikaEngine.java Tue Oct 2 06:50:16 2012
@@ -238,7 +238,7 @@ public class TikaEngine
ci.addPart(xhtmlBlobUri, xhtmlSink.getBlob());
}
//add the extracted metadata
- if(log.isDebugEnabled()){
+ if(log.isInfoEnabled()){
for(String name : metadata.names()){
log.info("{}: {}",name,Arrays.toString(metadata.getValues(name)));
}
Modified: stanbol/trunk/enhancer/engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/metadata/OntologyMappings.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancer/engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/metadata/OntologyMappings.java?rev=1392756&r1=1392755&r2=1392756&view=diff
==============================================================================
--- stanbol/trunk/enhancer/engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/metadata/OntologyMappings.java (original)
+++ stanbol/trunk/enhancer/engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/metadata/OntologyMappings.java Tue Oct 2 06:50:16 2012
@@ -165,41 +165,41 @@ public class OntologyMappings implements
String dc = NamespaceEnum.dc.getNamespace();
mappings.addMapping(
new PropertyMapping(dc+"contributor",
- DublinCore.CONTRIBUTOR,MSOffice.LAST_AUTHOR));
+ DublinCore.CONTRIBUTOR.getName(),MSOffice.LAST_AUTHOR));
mappings.addMapping(
- new PropertyMapping(dc+"coverage",DublinCore.COVERAGE));
+ new PropertyMapping(dc+"coverage",DublinCore.COVERAGE.getName()));
mappings.addMappings(
new PropertyMapping(dc+"creator",
- DublinCore.CREATOR,MSOffice.AUTHOR,"initial-creator"));
+ DublinCore.CREATOR.getName(),MSOffice.AUTHOR,"initial-creator"));
mappings.addMappings(
- new PropertyMapping(dc+"description",DublinCore.DESCRIPTION));
+ new PropertyMapping(dc+"description",DublinCore.DESCRIPTION.getName()));
mappings.addMappings(
new PropertyMapping(dc+"format",
- DublinCore.FORMAT,HttpHeaders.CONTENT_TYPE));
+ DublinCore.FORMAT.getName(),HttpHeaders.CONTENT_TYPE));
mappings.addMappings(
- new PropertyMapping(dc+"identifier",DublinCore.IDENTIFIER));
+ new PropertyMapping(dc+"identifier",DublinCore.IDENTIFIER.getName()));
mappings.addMappings(
new PropertyMapping(dc+"language",
- DublinCore.LANGUAGE,HttpHeaders.CONTENT_LANGUAGE));
+ DublinCore.LANGUAGE.getName(),HttpHeaders.CONTENT_LANGUAGE));
mappings.addMappings(
new PropertyMapping(dc+"modified",XSD.dateTime,
- DublinCore.MODIFIED,"Last-Modified"));
+ DublinCore.MODIFIED.getName(),"Last-Modified"));
mappings.addMappings(
new PropertyMapping(dc+"publisher",
- DublinCore.PUBLISHER,MSOffice.COMPANY));
+ DublinCore.PUBLISHER.getName(),MSOffice.COMPANY));
mappings.addMappings(
- new PropertyMapping(dc+"relation",DublinCore.RELATION));
+ new PropertyMapping(dc+"relation",DublinCore.RELATION.getName()));
mappings.addMappings(
- new PropertyMapping(dc+"rights",DublinCore.RIGHTS));
+ new PropertyMapping(dc+"rights",DublinCore.RIGHTS.getName()));
mappings.addMappings(
- new PropertyMapping(dc+"source",DublinCore.SOURCE));
+ new PropertyMapping(dc+"source",DublinCore.SOURCE.getName()));
mappings.addMappings(
new PropertyMapping(dc+"subject",
- DublinCore.SUBJECT,MSOffice.KEYWORDS));
+ DublinCore.SUBJECT.getName(),MSOffice.KEYWORDS));
mappings.addMappings(
- new PropertyMapping(dc+"title",DublinCore.TITLE));
+ new PropertyMapping(dc+"title",DublinCore.TITLE.getName()));
mappings.addMappings(
- new PropertyMapping(dc+"type",DublinCore.TYPE));
+ new PropertyMapping(dc+"type",DublinCore.TYPE.getName()));
mappings.addMappings(
new PropertyMapping(dc+"date",XSD.dateTime,DublinCore.DATE.getName()));
//MS Office -> DC
@@ -211,19 +211,19 @@ public class OntologyMappings implements
public static void addMediaResourceOntologyMappings(OntologyMappings mappings){
mappings.addMappings(
new PropertyMapping(ma+"hasContributor",
- DublinCore.CONTRIBUTOR,XMPDM.ARTIST.getName(),XMPDM.COMPOSER.getName()));
+ DublinCore.CONTRIBUTOR.getName(),XMPDM.ARTIST.getName(),XMPDM.COMPOSER.getName()));
mappings.addMapping(
new ResourceMapping(ma+"hasLocation",
new TypeMapping(ma+"Location"),
- new PropertyMapping(ma+"locationName",DublinCore.COVERAGE)));
+ new PropertyMapping(ma+"locationName",DublinCore.COVERAGE.getName())));
mappings.addMappings(
new PropertyMapping(ma+"hasCreator",
- DublinCore.CREATOR,MSOffice.AUTHOR,"initial-creator"));
+ DublinCore.CREATOR.getName(),MSOffice.AUTHOR,"initial-creator"));
mappings.addMappings(
- new PropertyMapping(ma+"description",DublinCore.DESCRIPTION));
+ new PropertyMapping(ma+"description",DublinCore.DESCRIPTION.getName()));
mappings.addMappings(
new PropertyMapping(ma+"hasFormat",
- DublinCore.FORMAT,HttpHeaders.CONTENT_TYPE));
+ DublinCore.FORMAT.getName(),HttpHeaders.CONTENT_TYPE));
/*
* Excerpt of the MA recommendation:
* The identifier of a media resource is represented in RDF by the URI
@@ -231,30 +231,30 @@ public class OntologyMappings implements
* identified by several URI, owl:sameAs should be used.
*/
mappings.addMappings(
- new PropertyMapping(OWL.sameAs,RDFS.Resource,DublinCore.IDENTIFIER));
+ new PropertyMapping(OWL.sameAs,RDFS.Resource,DublinCore.IDENTIFIER.getName()));
mappings.addMappings(
new PropertyMapping(ma+"hasLanguage",
- DublinCore.LANGUAGE,HttpHeaders.CONTENT_LANGUAGE));
+ DublinCore.LANGUAGE.getName(),HttpHeaders.CONTENT_LANGUAGE));
mappings.addMappings(
new PropertyMapping(ma+"editDate",XSD.dateTime,
- DublinCore.MODIFIED,MSOffice.LAST_SAVED.getName()));
+ DublinCore.MODIFIED.getName(),MSOffice.LAST_SAVED.getName()));
mappings.addMappings(
- new PropertyMapping(ma+"hasPublisher",DublinCore.PUBLISHER));
+ new PropertyMapping(ma+"hasPublisher",DublinCore.PUBLISHER.getName()));
mappings.addMappings(
- new PropertyMapping(ma+"hasRelatedResource",DublinCore.RELATION));
+ new PropertyMapping(ma+"hasRelatedResource",DublinCore.RELATION.getName()));
mappings.addMappings(
new PropertyMapping(ma+"copyright",RDFS.Resource,
//DC:rights and cc:license
- DublinCore.RIGHTS,CreativeCommons.LICENSE_LOCATION, CreativeCommons.LICENSE_URL,
+ DublinCore.RIGHTS.getName(),CreativeCommons.LICENSE_LOCATION, CreativeCommons.LICENSE_URL,
XMPDM.COPYRIGHT.getName()));
mappings.addMappings(
- new PropertyMapping(ma+"isMemberOf",DublinCore.SOURCE));
+ new PropertyMapping(ma+"isMemberOf",DublinCore.SOURCE.getName()));
mappings.addMappings(
new PropertyMapping(ma+"hasKeyword",
- DublinCore.SUBJECT,MSOffice.KEYWORDS));
+ DublinCore.SUBJECT.getName(),MSOffice.KEYWORDS));
mappings.addMappings(
new PropertyMapping(ma+"title",
- DublinCore.TITLE,XMPDM.SCENE.getName(),XMPDM.TAPE_NAME.getName(),
+ DublinCore.TITLE.getName(),XMPDM.SCENE.getName(),XMPDM.TAPE_NAME.getName(),
XMPDM.SHOT_NAME.getName()));
mappings.addMapping(
new PropertyMapping(ma+"alternativeTitle", XMPDM.ALT_TAPE_NAME.getName()));
@@ -262,13 +262,13 @@ public class OntologyMappings implements
new PropertyMapping(ma+"mainOriginalTitle", XMPDM.ALBUM.getName()));
mappings.addMappings(
new PropertyMapping(ma+"hasGenre",
- DublinCore.TYPE,XMPDM.GENRE.getName()));
+ DublinCore.TYPE.getName(),XMPDM.GENRE.getName()));
mappings.addMappings(
new PropertyMapping(ma+"creationDate",XSD.dateTime,
DublinCore.DATE.getName(),MSOffice.CREATION_DATE.getName(),"created"));
mappings.addMapping(
new PropertyMapping(ma+"description",
- DublinCore.DESCRIPTION,MSOffice.COMMENTS));
+ DublinCore.DESCRIPTION.getName(),MSOffice.COMMENTS));
mappings.addMappings(
new PropertyMapping(ma+"hasContributor",
@@ -400,13 +400,13 @@ public class OntologyMappings implements
//DC -> SKOS
mappings.addMappings(
new PropertyMapping(SKOS.prefLabel,
- DublinCore.TITLE));
+ DublinCore.TITLE.getName()));
mappings.addMappings(
new PropertyMapping(SKOS.definition,
- DublinCore.DESCRIPTION));
+ DublinCore.DESCRIPTION.getName()));
mappings.addMappings(
new PropertyMapping(SKOS.notation,
- DublinCore.IDENTIFIER));
+ DublinCore.IDENTIFIER.getName()));
//MS Office -> SKOS
mappings.addMappings(
new PropertyMapping(SKOS.note,MSOffice.COMMENTS));
@@ -418,9 +418,9 @@ public class OntologyMappings implements
public static void addRdfsMappings(OntologyMappings mappings){
//DC
mappings.addMappings(
- new PropertyMapping(RDFS.label,DublinCore.TITLE));
+ new PropertyMapping(RDFS.label,DublinCore.TITLE.getName()));
mappings.addMappings(
- new PropertyMapping(RDFS.comment,DublinCore.DESCRIPTION,MSOffice.COMMENTS));
+ new PropertyMapping(RDFS.comment,DublinCore.DESCRIPTION.getName(),MSOffice.COMMENTS));
}
/**
Modified: stanbol/trunk/enhancer/engines/tika/src/test/java/org/apache/stanbol/enhancer/engines/tika/TikaEngineTest.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancer/engines/tika/src/test/java/org/apache/stanbol/enhancer/engines/tika/TikaEngineTest.java?rev=1392756&r1=1392755&r2=1392756&view=diff
==============================================================================
--- stanbol/trunk/enhancer/engines/tika/src/test/java/org/apache/stanbol/enhancer/engines/tika/TikaEngineTest.java (original)
+++ stanbol/trunk/enhancer/engines/tika/src/test/java/org/apache/stanbol/enhancer/engines/tika/TikaEngineTest.java Tue Oct 2 06:50:16 2012
@@ -301,7 +301,7 @@ public class TikaEngineTest {
assertNotNull(xhtmlBlob);
assertContentRegexp(xhtmlBlob,
"<html xmlns=\"http://www.w3.org/1999/xhtml\">",
- "<title></title>",
+ "<title>\\[jira\\] Commented: \\(TIKA-461\\) RFC822 messages not parsed</title>",
"<body><p>",
"Julien Nioche commented on TIKA-461:",
"I'll have a look at mime4j and try to use it in Tika",
@@ -310,9 +310,12 @@ public class TikaEngineTest {
"URL: https://issues.apache.org/jira/browse/TIKA-461");
//no check the extracted metadata!
//DC
- verifyValue(ci, new UriRef(NamespaceEnum.dc+"date"), XSD.dateTime,"2010-09-06T09:25:34Z");
+ //STANBOL-757: dc:date no longer added by Tika 1.2 (dc:created is still present)
+ //verifyValue(ci, new UriRef(NamespaceEnum.dc+"date"), XSD.dateTime,"2010-09-06T09:25:34Z");
verifyValue(ci, new UriRef(NamespaceEnum.dc+"format"), null,"message/rfc822");
- verifyValue(ci, new UriRef(NamespaceEnum.dc+"subject"), null,"[jira] Commented: (TIKA-461) RFC822 messages not parsed");
+ //STANBOL-757: dc:subject no longer added by Tika 1.2 (dc:title is used instead)
+ //verifyValue(ci, new UriRef(NamespaceEnum.dc+"subject"), null,"[jira] Commented: (TIKA-461) RFC822 messages not parsed");
+ verifyValue(ci, new UriRef(NamespaceEnum.dc+"title"), null,"[jira] Commented: (TIKA-461) RFC822 messages not parsed");
verifyValue(ci, new UriRef(NamespaceEnum.dc+"creator"), null,"Julien Nioche (JIRA) <ji...@apache.org>");
verifyValue(ci, new UriRef(NamespaceEnum.dc+"created"), XSD.dateTime,"2010-09-06T09:25:34Z");
@@ -321,7 +324,8 @@ public class TikaEngineTest {
verifyValue(ci, new UriRef(NamespaceEnum.media+"hasFormat"),null,"message/rfc822");
verifyValue(ci, new UriRef(NamespaceEnum.media+"hasCreator"),null,"Julien Nioche (JIRA) <ji...@apache.org>");
verifyValue(ci, new UriRef(NamespaceEnum.media+"hasContributor"),null,"Julien Nioche (JIRA) <ji...@apache.org>");
- verifyValue(ci, new UriRef(NamespaceEnum.media+"hasKeyword"),null,"[jira] Commented: (TIKA-461) RFC822 messages not parsed");
+ //STANBOL-757: This was present with Tika 1.1 because of its mapping from dc:subject
+// verifyValue(ci, new UriRef(NamespaceEnum.media+"hasKeyword"),null,"[jira] Commented: (TIKA-461) RFC822 messages not parsed");
//Nepomuk Message
@@ -370,7 +374,7 @@ public class TikaEngineTest {
* @throws IOException
* @throws ParseException
*/
- //@Test deactivated because of TIKA-852
+ @Test
public void testMp4() throws EngineException, IOException, ParseException {
log.info(">>> testMp4 <<<");
ContentItem ci = createContentItem("testMP4.m4a", "audio/mp4");
@@ -481,7 +485,7 @@ public class TikaEngineTest {
verifyValues(ci, new UriRef(NamespaceEnum.media+"hasKeyword"),null,"serbor","moscow-birds","canon-55-250");
//and finally the mapped DC properties
verifyValue(ci, new UriRef(NamespaceEnum.dc+"format"),null,"image/jpeg");
- verifyValue(ci, new UriRef(NamespaceEnum.dc+"date"),XSD.dateTime,"2009-08-11T09:09:45");
+ verifyValue(ci, new UriRef(NamespaceEnum.dc+"created"),XSD.dateTime,"2009-08-11T09:09:45");
verifyValue(ci, new UriRef(NamespaceEnum.dc+"modified"),XSD.dateTime,"2009-10-02T23:02:49");
verifyValues(ci, new UriRef(NamespaceEnum.dc+"subject"), null, "serbor","moscow-birds","canon-55-250");
}
Modified: stanbol/trunk/launchers/bundlelists/stanbolcommons/src/main/bundles/list.xml
URL: http://svn.apache.org/viewvc/stanbol/trunk/launchers/bundlelists/stanbolcommons/src/main/bundles/list.xml?rev=1392756&r1=1392755&r2=1392756&view=diff
==============================================================================
--- stanbol/trunk/launchers/bundlelists/stanbolcommons/src/main/bundles/list.xml (original)
+++ stanbol/trunk/launchers/bundlelists/stanbolcommons/src/main/bundles/list.xml Tue Oct 2 06:50:16 2012
@@ -104,12 +104,12 @@
<bundle>
<groupId>org.apache.commons</groupId>
<artifactId>commons-compress</artifactId>
- <version>1.0</version>
+ <version>1.3</version>
</bundle>
<bundle>
<groupId>commons-codec</groupId>
<artifactId>commons-codec</artifactId>
- <version>1.5</version>
+ <version>1.7</version>
</bundle>
<bundle>
<groupId>commons-fileupload</groupId>
Modified: stanbol/trunk/parent/pom.xml
URL: http://svn.apache.org/viewvc/stanbol/trunk/parent/pom.xml?rev=1392756&r1=1392755&r2=1392756&view=diff
==============================================================================
--- stanbol/trunk/parent/pom.xml (original)
+++ stanbol/trunk/parent/pom.xml Tue Oct 2 06:50:16 2012
@@ -686,7 +686,8 @@
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-compress</artifactId>
- <version>1.0</version>
+ <!-- Unable to use version 1.4+ because of COMPRESS-199 -->
+ <version>1.3</version>
</dependency>
<dependency>
<groupId>commons-logging</groupId>
@@ -696,7 +697,7 @@
<dependency>
<groupId>commons-codec</groupId>
<artifactId>commons-codec</artifactId>
- <version>1.5</version>
+ <version>1.7</version>
</dependency>
<dependency>
<groupId>commons-fileupload</groupId>
@@ -1149,12 +1150,12 @@
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-core</artifactId>
- <version>1.1</version>
+ <version>1.2</version>
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parsers</artifactId>
- <version>1.1</version>
+ <version>1.2</version>
</dependency>
<!-- Aperture -->
<dependency>