You are viewing a plain text version of this content. The canonical link for it is here.
Posted to oak-commits@jackrabbit.apache.org by to...@apache.org on 2017/10/16 19:44:29 UTC
svn commit: r1812316 - in /jackrabbit/oak/trunk/oak-search-mt: ./
src/main/java/org/apache/jackrabbit/oak/plugins/index/mt/
Author: tommaso
Date: Mon Oct 16 19:44:29 2017
New Revision: 1812316
URL: http://svn.apache.org/viewvc?rev=1812316&view=rev
Log:
OAK-6837 - failsafe MT setup and execution
Modified:
jackrabbit/oak/trunk/oak-search-mt/pom.xml
jackrabbit/oak/trunk/oak-search-mt/src/main/java/org/apache/jackrabbit/oak/plugins/index/mt/MTFulltextQueryTermsProvider.java
jackrabbit/oak/trunk/oak-search-mt/src/main/java/org/apache/jackrabbit/oak/plugins/index/mt/MTFulltextQueryTermsProviderFactory.java
Modified: jackrabbit/oak/trunk/oak-search-mt/pom.xml
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search-mt/pom.xml?rev=1812316&r1=1812315&r2=1812316&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-search-mt/pom.xml (original)
+++ jackrabbit/oak/trunk/oak-search-mt/pom.xml Mon Oct 16 19:44:29 2017
@@ -17,116 +17,172 @@
limitations under the License.
-->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd ">
- <modelVersion>4.0.0</modelVersion>
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd ">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.jackrabbit</groupId>
+ <artifactId>oak-parent</artifactId>
+ <version>1.8-SNAPSHOT</version>
+ <relativePath>../oak-parent/pom.xml</relativePath>
+ </parent>
+
+ <artifactId>oak-search-mt</artifactId>
+ <name>Oak Search Machine Translation</name>
+ <packaging>bundle</packaging>
+ <description>Machine Translation extension for Oak search</description>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.felix</groupId>
+ <artifactId>maven-bundle-plugin</artifactId>
+ <extensions>true</extensions>
+ <configuration>
+ <instructions>
+ <Export-Package>
+ !*
+ </Export-Package>
+ <Embed-Dependency>*;scope=compile</Embed-Dependency>
+ <Import-Package>
+ com.ibm.uvm.tools.*;resolution:=optional,
+ com.sun.jdmk.comm.*;resolution:=optional,
+ com.sun.net.httpserver.*;resolution:=optional,
+ edu.uci.ics.*;resolution:=optional,
+ javax.jms.*;resolution:=optional,
+ javax.jmdns.*;resolution:=optional,
+ junit.framework.*;resolution:=optional,
+ org.apache.commons.collections15.*;resolution:=optional,
+ org.apache.tools.ant.*;resolution:=optional,
+ org.apache.tools.ant.types.*;resolution:=optional,
+ org.easymock.*;resolution:=optional,
+ org.jmock.core.*;resolution:=optional,
+ sun.misc.*;resolution:=optional,
+ EDU.oswego.cs.dl.util.concurrent.*;resolution:=optional,
+ org.kohsuke.args4j.*;resolution:=optional,
+ *
+ </Import-Package>
+ </instructions>
+ </configuration>
+ <executions>
+ <execution>
+ <id>baseline</id>
+ <goals>
+ <goal>baseline</goal>
+ </goals>
+ <phase>pre-integration-test</phase>
+ <configuration>
+ <!--
+ This is required because there's no prior (stable) version of oak-search-mt.
+ This should be removed post 1.8 release
+ Anyway nothing is exported therefore it shouldn't be a problem
+ -->
+ <skip>true</skip>
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.rat</groupId>
+ <artifactId>apache-rat-plugin</artifactId>
+ </plugin>
+ </plugins>
+ </build>
+
+ <dependencies>
+ <!-- Optional OSGi dependencies, used only when running within OSGi -->
+ <dependency>
+ <groupId>org.osgi</groupId>
+ <artifactId>org.osgi.core</artifactId>
+ <scope>provided</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.osgi</groupId>
+ <artifactId>org.osgi.compendium</artifactId>
+ <scope>provided</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.osgi</groupId>
+ <artifactId>org.osgi.annotation</artifactId>
+ <scope>provided</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.felix</groupId>
+ <artifactId>org.apache.felix.scr.annotations</artifactId>
+ <scope>provided</scope>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.jackrabbit</groupId>
+ <artifactId>oak-lucene</artifactId>
+ <version>${project.version}</version>
+ <scope>provided</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.joshua</groupId>
+ <artifactId>joshua-incubating</artifactId>
+ <version>6.1</version>
+ </dependency>
+
+ <!-- Logging -->
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-api</artifactId>
+ <scope>provided</scope>
+ </dependency>
+
+ <!-- Findbugs annotations -->
+ <dependency>
+ <groupId>com.google.code.findbugs</groupId>
+ <artifactId>jsr305</artifactId>
+ <scope>provided</scope>
+ </dependency>
+
+ <!-- Test Dependencies -->
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.mockito</groupId>
+ <artifactId>mockito-core</artifactId>
+ <version>1.10.19</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>colt</groupId>
+ <artifactId>colt</artifactId>
+ <version>1.2.0</version>
+ </dependency>
+ <dependency>
+ <groupId>edu.berkeley.nlp</groupId>
+ <artifactId>berkeleylm</artifactId>
+ <version>1.1.2</version>
+ </dependency>
+ <dependency>
+ <groupId>commons-cli</groupId>
+ <artifactId>commons-cli</artifactId>
+ <version>1.2</version>
+ </dependency>
+ <dependency>
+ <groupId>commons-io</groupId>
+ <artifactId>commons-io</artifactId>
+ <version>2.4</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>com.google.guava</groupId>
+ <artifactId>guava</artifactId>
+ <version>18.0</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.commons</groupId>
+ <artifactId>commons-math3</artifactId>
+ <version>3.5</version>
+ </dependency>
- <parent>
- <groupId>org.apache.jackrabbit</groupId>
- <artifactId>oak-parent</artifactId>
- <version>1.8-SNAPSHOT</version>
- <relativePath>../oak-parent/pom.xml</relativePath>
- </parent>
-
- <artifactId>oak-search-mt</artifactId>
- <name>Oak Search Machine Translation</name>
- <packaging>bundle</packaging>
- <description>Machine Translation extension for Oak search</description>
-
- <build>
- <plugins>
- <plugin>
- <groupId>org.apache.felix</groupId>
- <artifactId>maven-bundle-plugin</artifactId>
- <configuration>
- <instructions>
- <Export-Package>
- !*
- </Export-Package>
- </instructions>
- </configuration>
- <executions>
- <execution>
- <id>baseline</id>
- <goals>
- <goal>baseline</goal>
- </goals>
- <phase>pre-integration-test</phase>
- <configuration>
- <!--
- This is required because there's no prior (stable) version of oak-search-mt.
- This should be removed post 1.8 release
- Anyway nothing is exported therefore it shouldn't be a problem
- -->
- <skip>true</skip>
- </configuration>
- </execution>
- </executions>
- </plugin>
- <plugin>
- <groupId>org.apache.rat</groupId>
- <artifactId>apache-rat-plugin</artifactId>
- </plugin>
- </plugins>
- </build>
-
- <dependencies>
- <!-- Optional OSGi dependencies, used only when running within OSGi -->
- <dependency>
- <groupId>org.osgi</groupId>
- <artifactId>org.osgi.core</artifactId>
- <scope>provided</scope>
- </dependency>
- <dependency>
- <groupId>org.osgi</groupId>
- <artifactId>org.osgi.compendium</artifactId>
- <scope>provided</scope>
- </dependency>
- <dependency>
- <groupId>org.osgi</groupId>
- <artifactId>org.osgi.annotation</artifactId>
- <scope>provided</scope>
- </dependency>
- <dependency>
- <groupId>org.apache.felix</groupId>
- <artifactId>org.apache.felix.scr.annotations</artifactId>
- <scope>provided</scope>
- </dependency>
-
- <dependency>
- <groupId>org.apache.jackrabbit</groupId>
- <artifactId>oak-lucene</artifactId>
- <version>${project.version}</version>
- </dependency>
- <dependency>
- <groupId>org.apache.joshua</groupId>
- <artifactId>joshua-incubating</artifactId>
- <version>6.1</version>
- </dependency>
-
- <!-- Logging -->
- <dependency>
- <groupId>org.slf4j</groupId>
- <artifactId>slf4j-api</artifactId>
- </dependency>
-
- <!-- Findbugs annotations -->
- <dependency>
- <groupId>com.google.code.findbugs</groupId>
- <artifactId>jsr305</artifactId>
- </dependency>
-
- <!-- Test Dependencies -->
- <dependency>
- <groupId>junit</groupId>
- <artifactId>junit</artifactId>
- <scope>test</scope>
- </dependency>
- <dependency>
- <groupId>org.mockito</groupId>
- <artifactId>mockito-core</artifactId>
- <version>1.10.19</version>
- <scope>test</scope>
- </dependency>
-
- </dependencies>
+ </dependencies>
</project>
+
Modified: jackrabbit/oak/trunk/oak-search-mt/src/main/java/org/apache/jackrabbit/oak/plugins/index/mt/MTFulltextQueryTermsProvider.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search-mt/src/main/java/org/apache/jackrabbit/oak/plugins/index/mt/MTFulltextQueryTermsProvider.java?rev=1812316&r1=1812315&r2=1812316&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-search-mt/src/main/java/org/apache/jackrabbit/oak/plugins/index/mt/MTFulltextQueryTermsProvider.java (original)
+++ jackrabbit/oak/trunk/oak-search-mt/src/main/java/org/apache/jackrabbit/oak/plugins/index/mt/MTFulltextQueryTermsProvider.java Mon Oct 16 19:44:29 2017
@@ -19,7 +19,6 @@
package org.apache.jackrabbit.oak.plugins.index.mt;
import javax.annotation.Nonnull;
-import java.io.IOException;
import java.io.StringReader;
import java.util.List;
import java.util.Set;
@@ -63,30 +62,38 @@ public class MTFulltextQueryTermsProvide
@Override
public Query getQueryTerm(String text, Analyzer analyzer, NodeState indexDefinition) {
BooleanQuery query = new BooleanQuery();
- Sentence sentence = new Sentence(text, 0, decoder.getJoshuaConfiguration());
- Translation translation = decoder.decode(sentence);
- log.debug("{} decoded into {}", text, translation);
- // try phrase translation first
- List<StructuredTranslation> structuredTranslations = translation.getStructuredTranslations();
- if (!structuredTranslations.isEmpty()) {
- addTranslations(query, structuredTranslations);
- } else {
- // if phrase cannot be translated, perform token by token translation
- try {
+ try {
+ Sentence sentence = new Sentence(text, text.hashCode(), decoder.getJoshuaConfiguration());
+ Translation translation = decoder.decode(sentence);
+ log.debug("{} decoded into {}", text, translation);
+ query.add(new BooleanClause(new TermQuery(new Term(FieldNames.FULLTEXT, translation.toString())), BooleanClause.Occur.SHOULD));
+
+
+ // try phrase translation first
+ List<StructuredTranslation> structuredTranslations = translation.getStructuredTranslations();
+ log.debug("found {} structured translations", structuredTranslations.size());
+ if (!structuredTranslations.isEmpty()) {
+ log.debug("phrase translation");
+ addTranslations(query, structuredTranslations);
+ } else {
+ // if phrase cannot be translated, perform token by token translation
+ log.debug("per token translation");
+
TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(text));
tokenStream.addAttribute(CharTermAttribute.class);
tokenStream.reset();
while (tokenStream.incrementToken()) {
CharTermAttribute attribute = tokenStream.getAttribute(CharTermAttribute.class);
- Translation translatedToken = decoder.decode(new Sentence(attribute.toString(), 0,
+ String source = attribute.toString();
+ Translation translatedToken = decoder.decode(new Sentence(source, source.hashCode(),
decoder.getJoshuaConfiguration()));
addTranslations(query, translatedToken.getStructuredTranslations());
}
tokenStream.end();
- } catch (IOException e) {
- throw new RuntimeException(e);
}
+ } catch (Exception e) {
+ log.error("could not translate query", e);
}
return query.clauses().size() > 0 ? query : null;
}
@@ -94,7 +101,10 @@ public class MTFulltextQueryTermsProvide
private void addTranslations(BooleanQuery query, List<StructuredTranslation> structuredTranslations) {
for (StructuredTranslation st : structuredTranslations) {
String translationString = st.getTranslationString();
- if (st.getTranslationScore() > minScore) {
+ float translationScore = st.getTranslationScore();
+ log.debug("translation {} has score {}", translationString, translationScore);
+ if (translationScore > minScore) {
+ log.debug("translation score for {} is {}", translationString, translationScore);
query.add(new BooleanClause(new TermQuery(new Term(FieldNames.FULLTEXT, translationString)),
BooleanClause.Occur.SHOULD));
log.debug("added query for translated phrase {}", translationString);
@@ -104,9 +114,9 @@ public class MTFulltextQueryTermsProvide
for (List<Integer> wa : st.getTranslationWordAlignments()) {
if (!wa.isEmpty()) {
String translatedTerm = translationTokens.get(i);
- log.debug("added query for translated token {}", translatedTerm);
query.add(new BooleanClause(new TermQuery(new Term(FieldNames.FULLTEXT, translatedTerm)),
BooleanClause.Occur.SHOULD));
+ log.debug("added query for translated token {}", translatedTerm);
}
i++;
}
Modified: jackrabbit/oak/trunk/oak-search-mt/src/main/java/org/apache/jackrabbit/oak/plugins/index/mt/MTFulltextQueryTermsProviderFactory.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search-mt/src/main/java/org/apache/jackrabbit/oak/plugins/index/mt/MTFulltextQueryTermsProviderFactory.java?rev=1812316&r1=1812315&r2=1812316&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-search-mt/src/main/java/org/apache/jackrabbit/oak/plugins/index/mt/MTFulltextQueryTermsProviderFactory.java (original)
+++ jackrabbit/oak/trunk/oak-search-mt/src/main/java/org/apache/jackrabbit/oak/plugins/index/mt/MTFulltextQueryTermsProviderFactory.java Mon Oct 16 19:44:29 2017
@@ -19,7 +19,9 @@
package org.apache.jackrabbit.oak.plugins.index.mt;
import javax.annotation.Nonnull;
+import java.io.File;
import java.util.Arrays;
+import java.util.Collections;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
@@ -34,6 +36,7 @@ import org.apache.jackrabbit.oak.commons
import org.apache.jackrabbit.oak.plugins.index.lucene.spi.FulltextQueryTermsProvider;
import org.apache.jackrabbit.oak.spi.state.NodeState;
import org.apache.joshua.decoder.Decoder;
+import org.apache.joshua.decoder.JoshuaConfiguration;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.search.Query;
import org.slf4j.Logger;
@@ -74,34 +77,50 @@ public class MTFulltextQueryTermsProvide
String[] nts = PropertiesUtil.toStringArray(config.get(NODE_TYPES), new String[]{"Oak:unstructured"});
float minScore = (float) PropertiesUtil.toDouble(config.get(MIN_SCORE), DEFAULT_MIN_SCORE);
log.info("activating MT FulltextQueryTermProvider from Joshua config at {} on {} nodetypes, minScore {}", pathToJoshuaConfig, nts, minScore);
+ Decoder decoder = null;
try {
- log.debug("parsing joshua config file");
- Decoder decoder = Decoder.createDecoder(pathToJoshuaConfig);
- decoder.getJoshuaConfiguration().use_structured_output = true;
- decoder.getJoshuaConfiguration().sanityCheck();
+ log.debug("reading joshua config");
+ JoshuaConfiguration configuration = new JoshuaConfiguration();
+ configuration.readConfigFile(pathToJoshuaConfig);
+ configuration.setConfigFilePath(new File(pathToJoshuaConfig).getCanonicalFile().getParent());
+ configuration.use_structured_output = true;
+ decoder = new Decoder(configuration, pathToJoshuaConfig);
log.debug("decoder initialized");
Set<String> nodeTypes = new HashSet<>();
nodeTypes.addAll(Arrays.asList(nts));
queryTermsProvider = new MTFulltextQueryTermsProvider(decoder, nodeTypes, minScore);
} catch (Exception e) {
- throw new RuntimeException(e);
+ log.error("could not initialize MTFulltextQueryTermProvider", e);
+ if (decoder != null) {
+ decoder.cleanUp();
+ }
}
}
@Deactivate
public void deactivate() throws Exception {
- log.info("clearing resources");
- queryTermsProvider.clearResources();
+ if (queryTermsProvider != null) {
+ log.debug("clearing resources");
+ queryTermsProvider.clearResources();
+ }
}
@Override
public Query getQueryTerm(String text, Analyzer analyzer, NodeState indexDefinition) {
- return queryTermsProvider.getQueryTerm(text, analyzer, indexDefinition);
+ if (queryTermsProvider != null) {
+ return queryTermsProvider.getQueryTerm(text, analyzer, indexDefinition);
+ } else {
+ return null;
+ }
}
@Nonnull
@Override
public Set<String> getSupportedTypes() {
- return queryTermsProvider.getSupportedTypes();
+ if (queryTermsProvider != null) {
+ return queryTermsProvider.getSupportedTypes();
+ } else {
+ return Collections.emptySet();
+ }
}
}