You are viewing a plain text version of this content. The canonical link for it is here.
Posted to oak-commits@jackrabbit.apache.org by to...@apache.org on 2017/10/16 19:44:29 UTC

svn commit: r1812316 - in /jackrabbit/oak/trunk/oak-search-mt: ./ src/main/java/org/apache/jackrabbit/oak/plugins/index/mt/

Author: tommaso
Date: Mon Oct 16 19:44:29 2017
New Revision: 1812316

URL: http://svn.apache.org/viewvc?rev=1812316&view=rev
Log:
OAK-6837 - failsafe MT setup and execution

Modified:
    jackrabbit/oak/trunk/oak-search-mt/pom.xml
    jackrabbit/oak/trunk/oak-search-mt/src/main/java/org/apache/jackrabbit/oak/plugins/index/mt/MTFulltextQueryTermsProvider.java
    jackrabbit/oak/trunk/oak-search-mt/src/main/java/org/apache/jackrabbit/oak/plugins/index/mt/MTFulltextQueryTermsProviderFactory.java

Modified: jackrabbit/oak/trunk/oak-search-mt/pom.xml
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search-mt/pom.xml?rev=1812316&r1=1812315&r2=1812316&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-search-mt/pom.xml (original)
+++ jackrabbit/oak/trunk/oak-search-mt/pom.xml Mon Oct 16 19:44:29 2017
@@ -17,116 +17,172 @@
    limitations under the License.
   -->
 
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd ">
-  <modelVersion>4.0.0</modelVersion>
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd ">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.jackrabbit</groupId>
+        <artifactId>oak-parent</artifactId>
+        <version>1.8-SNAPSHOT</version>
+        <relativePath>../oak-parent/pom.xml</relativePath>
+    </parent>
+
+    <artifactId>oak-search-mt</artifactId>
+    <name>Oak Search Machine Translation</name>
+    <packaging>bundle</packaging>
+    <description>Machine Translation extension for Oak search</description>
+
+    <build>
+        <plugins>
+            <plugin>
+                <groupId>org.apache.felix</groupId>
+                <artifactId>maven-bundle-plugin</artifactId>
+                <extensions>true</extensions>
+                <configuration>
+                    <instructions>
+                        <Export-Package>
+                            !*
+                        </Export-Package>
+                        <Embed-Dependency>*;scope=compile</Embed-Dependency>
+                        <Import-Package>
+                            com.ibm.uvm.tools.*;resolution:=optional,
+                            com.sun.jdmk.comm.*;resolution:=optional,
+                            com.sun.net.httpserver.*;resolution:=optional,
+                            edu.uci.ics.*;resolution:=optional,
+                            javax.jms.*;resolution:=optional,
+                            javax.jmdns.*;resolution:=optional,
+                            junit.framework.*;resolution:=optional,
+                            org.apache.commons.collections15.*;resolution:=optional,
+                            org.apache.tools.ant.*;resolution:=optional,
+                            org.apache.tools.ant.types.*;resolution:=optional,
+                            org.easymock.*;resolution:=optional,
+                            org.jmock.core.*;resolution:=optional,
+                            sun.misc.*;resolution:=optional,
+                            EDU.oswego.cs.dl.util.concurrent.*;resolution:=optional,
+                            org.kohsuke.args4j.*;resolution:=optional,
+                            *
+                        </Import-Package>
+                    </instructions>
+                </configuration>
+                <executions>
+                    <execution>
+                        <id>baseline</id>
+                        <goals>
+                            <goal>baseline</goal>
+                        </goals>
+                        <phase>pre-integration-test</phase>
+                        <configuration>
+                            <!--
+                              This is required because there's no prior (stable) version of oak-search-mt.
+                              This should be removed post 1.8 release
+                              Anyway nothing is exported therefore it shouldn't be a problem
+                            -->
+                            <skip>true</skip>
+                        </configuration>
+                    </execution>
+                </executions>
+            </plugin>
+            <plugin>
+                <groupId>org.apache.rat</groupId>
+                <artifactId>apache-rat-plugin</artifactId>
+            </plugin>
+        </plugins>
+    </build>
+
+    <dependencies>
+        <!-- Optional OSGi dependencies, used only when running within OSGi -->
+        <dependency>
+            <groupId>org.osgi</groupId>
+            <artifactId>org.osgi.core</artifactId>
+            <scope>provided</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.osgi</groupId>
+            <artifactId>org.osgi.compendium</artifactId>
+            <scope>provided</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.osgi</groupId>
+            <artifactId>org.osgi.annotation</artifactId>
+            <scope>provided</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.felix</groupId>
+            <artifactId>org.apache.felix.scr.annotations</artifactId>
+            <scope>provided</scope>
+        </dependency>
+
+        <dependency>
+            <groupId>org.apache.jackrabbit</groupId>
+            <artifactId>oak-lucene</artifactId>
+            <version>${project.version}</version>
+            <scope>provided</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.joshua</groupId>
+            <artifactId>joshua-incubating</artifactId>
+            <version>6.1</version>
+        </dependency>
+
+        <!-- Logging -->
+        <dependency>
+            <groupId>org.slf4j</groupId>
+            <artifactId>slf4j-api</artifactId>
+            <scope>provided</scope>
+        </dependency>
+
+        <!-- Findbugs annotations -->
+        <dependency>
+            <groupId>com.google.code.findbugs</groupId>
+            <artifactId>jsr305</artifactId>
+            <scope>provided</scope>
+        </dependency>
+
+        <!-- Test Dependencies -->
+        <dependency>
+            <groupId>junit</groupId>
+            <artifactId>junit</artifactId>
+            <scope>test</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.mockito</groupId>
+            <artifactId>mockito-core</artifactId>
+            <version>1.10.19</version>
+            <scope>test</scope>
+        </dependency>
+        <dependency>
+            <groupId>colt</groupId>
+            <artifactId>colt</artifactId>
+            <version>1.2.0</version>
+        </dependency>
+        <dependency>
+            <groupId>edu.berkeley.nlp</groupId>
+            <artifactId>berkeleylm</artifactId>
+            <version>1.1.2</version>
+        </dependency>
+        <dependency>
+            <groupId>commons-cli</groupId>
+            <artifactId>commons-cli</artifactId>
+            <version>1.2</version>
+        </dependency>
+        <dependency>
+            <groupId>commons-io</groupId>
+            <artifactId>commons-io</artifactId>
+            <version>2.4</version>
+            <scope>test</scope>
+        </dependency>
+        <dependency>
+            <groupId>com.google.guava</groupId>
+            <artifactId>guava</artifactId>
+            <version>18.0</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.commons</groupId>
+            <artifactId>commons-math3</artifactId>
+            <version>3.5</version>
+        </dependency>
 
-  <parent>
-    <groupId>org.apache.jackrabbit</groupId>
-    <artifactId>oak-parent</artifactId>
-    <version>1.8-SNAPSHOT</version>
-    <relativePath>../oak-parent/pom.xml</relativePath>
-  </parent>
-
-  <artifactId>oak-search-mt</artifactId>
-  <name>Oak Search Machine Translation</name>
-  <packaging>bundle</packaging>
-  <description>Machine Translation extension for Oak search</description>
-
-  <build>
-    <plugins>
-      <plugin>
-        <groupId>org.apache.felix</groupId>
-        <artifactId>maven-bundle-plugin</artifactId>
-        <configuration>
-          <instructions>
-            <Export-Package>
-              !*
-            </Export-Package>
-          </instructions>
-        </configuration>
-        <executions>
-          <execution>
-            <id>baseline</id>
-            <goals>
-              <goal>baseline</goal>
-            </goals>
-            <phase>pre-integration-test</phase>
-            <configuration>
-              <!--
-                This is required because there's no prior (stable) version of oak-search-mt.
-                This should be removed post 1.8 release
-                Anyway nothing is exported therefore it shouldn't be a problem
-              -->
-              <skip>true</skip>
-            </configuration>
-          </execution>
-        </executions>
-      </plugin>
-      <plugin>
-        <groupId>org.apache.rat</groupId>
-        <artifactId>apache-rat-plugin</artifactId>
-      </plugin>
-    </plugins>
-  </build>
-
-  <dependencies>
-    <!-- Optional OSGi dependencies, used only when running within OSGi -->
-    <dependency>
-      <groupId>org.osgi</groupId>
-      <artifactId>org.osgi.core</artifactId>
-      <scope>provided</scope>
-    </dependency>
-    <dependency>
-      <groupId>org.osgi</groupId>
-      <artifactId>org.osgi.compendium</artifactId>
-      <scope>provided</scope>
-    </dependency>
-    <dependency>
-      <groupId>org.osgi</groupId>
-      <artifactId>org.osgi.annotation</artifactId>
-      <scope>provided</scope>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.felix</groupId>
-      <artifactId>org.apache.felix.scr.annotations</artifactId>
-      <scope>provided</scope>
-    </dependency>
-
-    <dependency>
-      <groupId>org.apache.jackrabbit</groupId>
-      <artifactId>oak-lucene</artifactId>
-      <version>${project.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.joshua</groupId>
-      <artifactId>joshua-incubating</artifactId>
-      <version>6.1</version>
-    </dependency>
-
-    <!-- Logging -->
-    <dependency>
-      <groupId>org.slf4j</groupId>
-      <artifactId>slf4j-api</artifactId>
-    </dependency>
-
-    <!-- Findbugs annotations -->
-    <dependency>
-      <groupId>com.google.code.findbugs</groupId>
-      <artifactId>jsr305</artifactId>
-    </dependency>
-
-    <!-- Test Dependencies -->
-    <dependency>
-      <groupId>junit</groupId>
-      <artifactId>junit</artifactId>
-      <scope>test</scope>
-    </dependency>
-    <dependency>
-      <groupId>org.mockito</groupId>
-      <artifactId>mockito-core</artifactId>
-      <version>1.10.19</version>
-      <scope>test</scope>
-    </dependency>
-
-  </dependencies>
+    </dependencies>
 </project>
+

Modified: jackrabbit/oak/trunk/oak-search-mt/src/main/java/org/apache/jackrabbit/oak/plugins/index/mt/MTFulltextQueryTermsProvider.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search-mt/src/main/java/org/apache/jackrabbit/oak/plugins/index/mt/MTFulltextQueryTermsProvider.java?rev=1812316&r1=1812315&r2=1812316&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-search-mt/src/main/java/org/apache/jackrabbit/oak/plugins/index/mt/MTFulltextQueryTermsProvider.java (original)
+++ jackrabbit/oak/trunk/oak-search-mt/src/main/java/org/apache/jackrabbit/oak/plugins/index/mt/MTFulltextQueryTermsProvider.java Mon Oct 16 19:44:29 2017
@@ -19,7 +19,6 @@
 package org.apache.jackrabbit.oak.plugins.index.mt;
 
 import javax.annotation.Nonnull;
-import java.io.IOException;
 import java.io.StringReader;
 import java.util.List;
 import java.util.Set;
@@ -63,30 +62,38 @@ public class MTFulltextQueryTermsProvide
     @Override
     public Query getQueryTerm(String text, Analyzer analyzer, NodeState indexDefinition) {
         BooleanQuery query = new BooleanQuery();
-        Sentence sentence = new Sentence(text, 0, decoder.getJoshuaConfiguration());
-        Translation translation = decoder.decode(sentence);
-        log.debug("{} decoded into {}", text, translation);
-        // try phrase translation first
-        List<StructuredTranslation> structuredTranslations = translation.getStructuredTranslations();
-        if (!structuredTranslations.isEmpty()) {
-            addTranslations(query, structuredTranslations);
-        } else {
-            // if phrase cannot be translated, perform token by token translation
-            try {
+        try {
+            Sentence sentence = new Sentence(text, text.hashCode(), decoder.getJoshuaConfiguration());
+            Translation translation = decoder.decode(sentence);
+            log.debug("{} decoded into {}", text, translation);
+            query.add(new BooleanClause(new TermQuery(new Term(FieldNames.FULLTEXT, translation.toString())), BooleanClause.Occur.SHOULD));
+
+
+            // try phrase translation first
+            List<StructuredTranslation> structuredTranslations = translation.getStructuredTranslations();
+            log.debug("found {} structured translations", structuredTranslations.size());
+            if (!structuredTranslations.isEmpty()) {
+                log.debug("phrase translation");
+                addTranslations(query, structuredTranslations);
+            } else {
+                // if phrase cannot be translated, perform token by token translation
+                log.debug("per token translation");
+
                 TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(text));
                 tokenStream.addAttribute(CharTermAttribute.class);
                 tokenStream.reset();
                 while (tokenStream.incrementToken()) {
                     CharTermAttribute attribute = tokenStream.getAttribute(CharTermAttribute.class);
-                    Translation translatedToken = decoder.decode(new Sentence(attribute.toString(), 0,
+                    String source = attribute.toString();
+                    Translation translatedToken = decoder.decode(new Sentence(source, source.hashCode(),
                             decoder.getJoshuaConfiguration()));
                     addTranslations(query, translatedToken.getStructuredTranslations());
                 }
                 tokenStream.end();
-            } catch (IOException e) {
-                throw new RuntimeException(e);
             }
 
+        } catch (Exception e) {
+            log.error("could not translate query", e);
         }
         return query.clauses().size() > 0 ? query : null;
     }
@@ -94,7 +101,10 @@ public class MTFulltextQueryTermsProvide
     private void addTranslations(BooleanQuery query, List<StructuredTranslation> structuredTranslations) {
         for (StructuredTranslation st : structuredTranslations) {
             String translationString = st.getTranslationString();
-            if (st.getTranslationScore() > minScore) {
+            float translationScore = st.getTranslationScore();
+            log.debug("translation {} has score {}", translationString, translationScore);
+            if (translationScore > minScore) {
+                log.debug("translation score for {} is {}", translationString, translationScore);
                 query.add(new BooleanClause(new TermQuery(new Term(FieldNames.FULLTEXT, translationString)),
                         BooleanClause.Occur.SHOULD));
                 log.debug("added query for translated phrase {}", translationString);
@@ -104,9 +114,9 @@ public class MTFulltextQueryTermsProvide
                 for (List<Integer> wa : st.getTranslationWordAlignments()) {
                     if (!wa.isEmpty()) {
                         String translatedTerm = translationTokens.get(i);
-                        log.debug("added query for translated token {}", translatedTerm);
                         query.add(new BooleanClause(new TermQuery(new Term(FieldNames.FULLTEXT, translatedTerm)),
                                 BooleanClause.Occur.SHOULD));
+                        log.debug("added query for translated token {}", translatedTerm);
                     }
                     i++;
                 }

Modified: jackrabbit/oak/trunk/oak-search-mt/src/main/java/org/apache/jackrabbit/oak/plugins/index/mt/MTFulltextQueryTermsProviderFactory.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search-mt/src/main/java/org/apache/jackrabbit/oak/plugins/index/mt/MTFulltextQueryTermsProviderFactory.java?rev=1812316&r1=1812315&r2=1812316&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-search-mt/src/main/java/org/apache/jackrabbit/oak/plugins/index/mt/MTFulltextQueryTermsProviderFactory.java (original)
+++ jackrabbit/oak/trunk/oak-search-mt/src/main/java/org/apache/jackrabbit/oak/plugins/index/mt/MTFulltextQueryTermsProviderFactory.java Mon Oct 16 19:44:29 2017
@@ -19,7 +19,9 @@
 package org.apache.jackrabbit.oak.plugins.index.mt;
 
 import javax.annotation.Nonnull;
+import java.io.File;
 import java.util.Arrays;
+import java.util.Collections;
 import java.util.HashSet;
 import java.util.Map;
 import java.util.Set;
@@ -34,6 +36,7 @@ import org.apache.jackrabbit.oak.commons
 import org.apache.jackrabbit.oak.plugins.index.lucene.spi.FulltextQueryTermsProvider;
 import org.apache.jackrabbit.oak.spi.state.NodeState;
 import org.apache.joshua.decoder.Decoder;
+import org.apache.joshua.decoder.JoshuaConfiguration;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.search.Query;
 import org.slf4j.Logger;
@@ -74,34 +77,50 @@ public class MTFulltextQueryTermsProvide
         String[] nts = PropertiesUtil.toStringArray(config.get(NODE_TYPES), new String[]{"Oak:unstructured"});
         float minScore = (float) PropertiesUtil.toDouble(config.get(MIN_SCORE), DEFAULT_MIN_SCORE);
         log.info("activating MT FulltextQueryTermProvider from Joshua config at {} on {} nodetypes, minScore {}", pathToJoshuaConfig, nts, minScore);
+        Decoder decoder = null;
         try {
-            log.debug("parsing joshua config file");
-            Decoder decoder = Decoder.createDecoder(pathToJoshuaConfig);
-            decoder.getJoshuaConfiguration().use_structured_output = true;
-            decoder.getJoshuaConfiguration().sanityCheck();
+            log.debug("reading joshua config");
+            JoshuaConfiguration configuration = new JoshuaConfiguration();
+            configuration.readConfigFile(pathToJoshuaConfig);
+            configuration.setConfigFilePath(new File(pathToJoshuaConfig).getCanonicalFile().getParent());
+            configuration.use_structured_output = true;
+            decoder = new Decoder(configuration, pathToJoshuaConfig);
             log.debug("decoder initialized");
             Set<String> nodeTypes = new HashSet<>();
             nodeTypes.addAll(Arrays.asList(nts));
             queryTermsProvider = new MTFulltextQueryTermsProvider(decoder, nodeTypes, minScore);
         } catch (Exception e) {
-            throw new RuntimeException(e);
+            log.error("could not initialize MTFulltextQueryTermProvider", e);
+            if (decoder != null) {
+                decoder.cleanUp();
+            }
         }
     }
 
     @Deactivate
     public void deactivate() throws Exception {
-        log.info("clearing resources");
-        queryTermsProvider.clearResources();
+        if (queryTermsProvider != null) {
+            log.debug("clearing resources");
+            queryTermsProvider.clearResources();
+        }
     }
 
     @Override
     public Query getQueryTerm(String text, Analyzer analyzer, NodeState indexDefinition) {
-        return queryTermsProvider.getQueryTerm(text, analyzer, indexDefinition);
+        if (queryTermsProvider != null) {
+            return queryTermsProvider.getQueryTerm(text, analyzer, indexDefinition);
+        } else {
+            return null;
+        }
     }
 
     @Nonnull
     @Override
     public Set<String> getSupportedTypes() {
-        return queryTermsProvider.getSupportedTypes();
+        if (queryTermsProvider != null) {
+            return queryTermsProvider.getSupportedTypes();
+        } else {
+            return Collections.emptySet();
+        }
     }
 }