You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/10/11 21:00:08 UTC

[3/6] incubator-joshua git commit: updated model copying

updated model copying


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/a948f877
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/a948f877
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/a948f877

Branch: refs/heads/master
Commit: a948f877cd283826a23546041225b48b539755bf
Parents: 8cbfbfe
Author: Matt Post <po...@cs.jhu.edu>
Authored: Wed Oct 5 15:50:04 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Thu Oct 6 14:15:39 2016 -0400

----------------------------------------------------------------------
 scripts/language-pack/build_lp.sh   |  19 +-
 scripts/language-pack/copy_model.py | 540 +++++++++++++++++++++++++++++++
 2 files changed, 550 insertions(+), 9 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/a948f877/scripts/language-pack/build_lp.sh
----------------------------------------------------------------------
diff --git a/scripts/language-pack/build_lp.sh b/scripts/language-pack/build_lp.sh
old mode 100644
new mode 100755
index fc3615f..a51376d
--- a/scripts/language-pack/build_lp.sh
+++ b/scripts/language-pack/build_lp.sh
@@ -20,30 +20,31 @@ if [[ -z $4 ]]; then
     exit 1
 fi
 
-JOSHUA=$(dirname $0/../..)
+set -u
+set -e
+
+JOSHUA=$(dirname $0)/../..
 date=$(date +%Y-%m-%d)
-dest=$langpair/releases/apache-joshua-$langpair-$date
+dest=releases/apache-joshua-$langpair-$date
 source=$(echo $langpair | cut -d- -f1)
 target=$(echo $langpair | cut -d- -f2)
 
 # Create the jar file
-(cd $JOSHUA && mvn compile assembly:single)
+(cd $JOSHUA && mvn clean compile assembly:single)
 
 # Copy over critical infrastructure files
-[[ ! -d "$dest/target" ]] && mkdir "$dest/target"
-[[ ! -d "$dest/bin" ]] && mkdir "$dest/bin"
+[[ ! -d "$dest/target" ]] && mkdir -p "$dest/target"
+[[ ! -d "$dest/bin" ]] && mkdir -p "$dest/bin"
 cp $JOSHUA/target/joshua-*-jar-with-dependencies.jar $dest/target
-cp $JOSHUA/bin/joshua $dest/bin
 
 # Copy over the web demonstration
-cp -a $JOSHUA/demo web
+cp -a $JOSHUA/demo $dest/web
 
 # Create the bundle
 # ... --copy-config-options "-lower-case true -project-case true"
-$JOSHUA/scripts/support/run_bundler.py \
+$JOSHUA/scripts/language-pack/copy_model.py \
     --force \
     --verbose \
-    --root $langpair/$modelno \
     --copy-config-options \
       '-top-n 1 -output-format %S -mark-oovs false -lowercase true -projectcase true' \
     $config \

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/a948f877/scripts/language-pack/copy_model.py
----------------------------------------------------------------------
diff --git a/scripts/language-pack/copy_model.py b/scripts/language-pack/copy_model.py
new file mode 100755
index 0000000..1ff55bb
--- /dev/null
+++ b/scripts/language-pack/copy_model.py
@@ -0,0 +1,540 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""
+Combine a set of Joshua configuration and resources into a portable
+directory tree.
+"""
+from __future__ import print_function
+import argparse
+from collections import namedtuple
+import logging
+import os
+import shutil
+import signal
+import stat
+from subprocess import CalledProcessError, Popen, PIPE
+import sys
+
+# Usage example printed after the help text when command-line parsing fails
+# (see the error() override inside handle_args). Raw string so the shell
+# line-continuation backslashes are kept verbatim.
+EXAMPLE = r"""
+Example invocation:
+
+$JOSHUA/scripts/language-pack/copy_model.py \
+  --force \
+  --verbose \
+  --copy-config-options \
+    '-top-n 1 -output-format %S -mark-oovs false' \
+  /path/to/origin/directory/test/model/joshua.config \
+  /path/to/destination/directory
+
+Note: The options included in the value string for the --copy-config-options
+argument can either be Joshua options or options for the
+$JOSHUA/scripts/copy-config.pl script. 
+"""
+
+# Root of the Joshua installation, taken from the JOSHUA environment variable.
+# The variable may be unset at import time; the __main__ guard reports that
+# case with a readable error, so the default tool paths are built from ''
+# instead of crashing in os.path.join(None, ...) before that check can run.
+JOSHUA_PATH = os.environ.get('JOSHUA')
+_joshua_root = JOSHUA_PATH or ''
+default_normalizer = os.path.join(_joshua_root, "scripts/preparation/normalize.pl")
+default_tokenizer = os.path.join(_joshua_root, "scripts/preparation/tokenize.pl")
+
+# A config line whose first token is one of these names a model file.
+FILE_TYPE_TOKENS = ['lm', 'tm']
+# Option flags whose following token is a file or directory path.
+FILE_TYPE_OPTIONS = ['-path', '-lm_file']
+
+# Names of the files written at the top of the bundle directory.
+OUTPUT_CONFIG_FILE_NAME = 'joshua.config'
+BUNDLE_RUNNER_FILE_NAME = 'joshua'
+# Text of the bash launcher written into the bundle. Paths derived from $0
+# are quoted so the bundle still works when installed under a directory
+# whose name contains spaces.
+BUNDLE_RUNNER_TEXT = """#!/bin/bash
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Joshua decoder invocation script.
+#
+# This script takes care of passing arguments to Java and to the
+# Joshua decoder. It changes to the directory containing this script so
+# that relative paths in the config file resolve inside the bundle. Usage:
+#
+# joshua [-m memory] [Joshua arguments]
+#
+# The default amount of memory is 4gb.
+
+NUM_ARGS=0
+E_OPTERROR=1
+
+## memory usage; default is 4 GB
+mem=4g
+
+if [[ $1 == "-m" ]]; then
+    mem=$2
+    shift
+    shift
+fi
+
+set -u
+
+bundledir=$(dirname "$0")
+cd "$bundledir"   # relative paths are now safe....
+
+exec java -mx${mem} \
+    -Dfile.encoding=utf8 \
+    -Djava.library.path=./lib \
+    -cp ./target/joshua-*-jar-with-dependencies.jar \
+    org.apache.joshua.decoder.JoshuaDecoder -c joshua.config "$@"
+"""
+
+
+# Two-part view of one config line as produced by extract_line_parts():
+# `config` is the text before the first '#', `comment` the text after it.
+LineParts = namedtuple('LineParts', ['config', 'comment'])
+
+
+class PathException(Exception):
+    """Raised when a path named in the configuration cannot be used."""
+
+
+class PackingError(Exception):
+    """Error type reserved for failures while packing a grammar."""
+
+
+def error_quit(message):
+    """
+    Log message at ERROR level and terminate the process with exit
+    status 2.
+    """
+    logging.error(message)
+    raise SystemExit(2)
+
+
+def extract_line_parts(line):
+    """
+    Split a config line at its first '#' character.
+
+    Returns a LineParts whose `config` is everything before the '#' and
+    whose `comment` is everything after it. When the line has no '#',
+    `config` is the whole line and `comment` is the empty string.
+    """
+    before_hash, _, after_hash = line.partition('#')
+    return LineParts(config=before_hash, comment=after_hash)
+
+
+def filter_through_copy_config_script(config_text, copy_configs):
+    """
+    Run the config_text through the 'copy-config.pl' script, applying
+    the copy_configs options, and return the script's standard output.
+
+    config_text is written to the script's stdin; copy_configs is a
+    string of options appended verbatim to the command line.
+
+    Raises CalledProcessError (carrying the exit status and the command)
+    if the script exits with a nonzero status. The script's stderr is not
+    captured, so its own diagnostics appear directly on the terminal.
+    """
+    # NOTE: the command is run through the shell so that quoting inside
+    # copy_configs is interpreted as the user typed it. copy_configs must
+    # therefore come from a trusted source (the command line).
+    cmd = os.path.join(JOSHUA_PATH, "scripts/copy-config.pl") + ' ' + copy_configs
+    logging.info(
+        'Running the copy-config.pl script with the command: ' + cmd
+    )
+    p = Popen(cmd, shell=True, stdin=PIPE, stdout=PIPE)
+    result, _ = p.communicate(config_text)
+    if p.returncode != 0:
+        # Log the details here: CalledProcessError takes (returncode, cmd),
+        # not a free-form message, so the text cannot travel in the
+        # exception itself.
+        logging.error(
+            'Encountered an error running the copy-config.pl script.\n'
+            '  command: %s\n'
+            '  exit status: %s'
+            % (cmd, p.returncode)
+        )
+        raise CalledProcessError(p.returncode, cmd)
+    return result
+
+
+def line_specifies_path(line):
+    """
+    Tell whether a joshua.config line names a file or directory path.
+
+    Only the part of the line before any '#' comment is examined. The
+    line qualifies when its first token is one of FILE_TYPE_TOKENS or
+    when any token is one of FILE_TYPE_OPTIONS.
+
+    >>> line_specifies_path('tm = thrax glue -1 1/data/tune/grammar.glue')
+    True
+    >>> line_specifies_path('tm = moses -owner pt -maxspan 0 -path phrase-table.packed -max-source-len 5')
+    True
+    >>> line_specifies_path('tm = moses pt 0 phrase-table.packed')
+    True
+    >>> line_specifies_path('feature-function = WordPenalty')
+    False
+    >>> line_specifies_path('feature_function = Distortion')
+    False
+    >>> line_specifies_path('feature-function = StateMinimizingLanguageModel -lm_type kenlm -lm_order 5 -lm_file expts/systems/es-en/1/lm.kenlm')
+    True
+    >>> line_specifies_path('# Foo')
+    False
+    """
+    tokens = extract_line_parts(line).config.split()
+    if not tokens:
+        # Blank line, or nothing but a comment.
+        return False
+
+    starts_with_file_type = tokens[0] in FILE_TYPE_TOKENS
+    has_path_option = any(option in tokens for option in FILE_TYPE_OPTIONS)
+    return starts_with_file_type or has_path_option
+
+
+def validate_path(path):
+    """
+    Raise PathException if nothing exists at the specified path;
+    otherwise return None.
+    """
+    if os.path.exists(path):
+        return
+    raise PathException(
+        'The path "%s" does not exist. Cannot proceed.' % path
+    )
+
+
+def parse_path(config_line):
+    """
+    Given a Joshua config line with no comments, return a path specified
+    by the config.
+
+    If the line contains one of the FILE_TYPE_OPTIONS flags (checked in
+    list order), the path is the token following the first such flag;
+    otherwise the path is assumed to be the final token.
+
+    Raises PathException when the line has no tokens or when a path
+    option is the last token and so has no value.
+
+    >>> parse_path('tm = moses -owner pt -maxspan 0 -path phrase-table.packed -max-source-len 5')
+    'phrase-table.packed'
+    >>> parse_path('tm = moses pt 0 phrase-table.packed')
+    'phrase-table.packed'
+    """
+    config_tokens = config_line.split()
+    if not config_tokens:
+        raise PathException('The configuration line is empty; no path found.')
+
+    for path_opt in FILE_TYPE_OPTIONS:
+        if path_opt in config_tokens:
+            path_index = config_tokens.index(path_opt) + 1
+            if path_index >= len(config_tokens):
+                # The flag is the last token: report it instead of letting
+                # an IndexError escape with an unhelpful message.
+                raise PathException(
+                    'The option "%s" is not followed by a path.' % path_opt
+                )
+            return config_tokens[path_index]
+
+    # No explicit path option: the path is the final token.
+    return config_tokens[-1]
+
+
+# How many times each source file/directory name has been requested.
+duplicate_name_counts = {}
+# Every destination name handed out so far, so that a generated name such as
+# 'lm.2.kenlm' can never clash with a real file of that same name.
+_issued_dest_names = set()
+
+
+def get_unique_dest(name):
+    """
+    Return a destination name for `name` that has not been returned
+    before.
+
+    The first request for a name returns it unchanged. Later requests
+    insert the number of times the name has been seen before the
+    extension, e.g. 'lm.kenlm' -> 'lm.2.kenlm' -> 'lm.3.kenlm'. If the
+    candidate was already issued (for instance a real file is itself
+    called 'lm.2.kenlm'), the counter keeps advancing until the
+    candidate is unused.
+    """
+    pre_extension, extension = os.path.splitext(name)
+    while True:
+        times_seen = duplicate_name_counts.get(name, 0) + 1
+        duplicate_name_counts[name] = times_seen
+        if times_seen == 1:
+            result = name
+        else:
+            result = "{0}.{1}{2}".format(pre_extension, times_seen, extension)
+        if result not in _issued_dest_names:
+            _issued_dest_names.add(result)
+            return result
+
+
+def recursive_copy(src, dest, symlink = False):
+    """
+    Make src available at dest.
+
+    With symlink true, dest becomes a symbolic link pointing at src.
+    Otherwise a directory is copied recursively (symbolic links inside
+    it are kept as links) and anything else is copied as a single file.
+    """
+    if symlink:
+        os.symlink(src, dest)
+        return
+
+    if not os.path.isdir(src):
+        shutil.copy(src, dest)
+        return
+
+    shutil.copytree(src, dest, symlinks=True)
+
+
+def _replace_last_token(text, old_token, new_token):
+    """
+    Return text with the last whitespace-delimited token equal to
+    old_token replaced by new_token; all other text is left untouched.
+    """
+    import re  # local import: this module does not import re at file level
+    pattern = r'(?<!\S)' + re.escape(old_token) + r'(?!\S)'
+    matches = list(re.finditer(pattern, text))
+    if not matches:
+        return text
+    last = matches[-1]
+    return text[:last.start()] + new_token + text[last.end():]
+
+
+def process_line_containing_path(line, dest_dir, symlink, absolute):
+    """
+    The line has already been determined to contain a path, so generate
+    an operation tuple that brings that file or directory into the
+    bundle, and rewrite the config line to point at the bundled copy.
+
+    Arguments:
+      line     -- the config line, possibly with a trailing '#' comment
+      dest_dir -- root directory of the bundle; files go in its 'model'
+                  subdirectory
+      symlink  -- passed through to recursive_copy; when true the bundle
+                  gets a symbolic link instead of a copy
+      absolute -- when true the rewritten line uses the absolute
+                  destination path, otherwise 'model/<name>'
+
+    Returns a tuple (updated_line, operation) where operation is
+      (recursive_copy, (source, destination, symlink), 'logging message')
+
+    Raises PathException if the source path does not exist.
+    """
+    #####################
+    # Get the source path
+
+    logging.debug('Looking for a path in the line:\n    %s' % line)
+    line_parts = extract_line_parts(line)
+
+    src_path = parse_path(line_parts.config)
+    logging.debug('* Found path "%s"' % src_path)
+
+    # Use the absolute source path: a symlink target that is relative would
+    # be resolved against the link's own directory and end up dangling.
+    # abspath also drops any trailing slash, so basename below is never ''.
+    full_src_path = os.path.abspath(src_path)
+    validate_path(full_src_path)
+
+    #####################################
+    # Determine a unique destination path
+
+    dest_name = get_unique_dest(os.path.basename(full_src_path))
+    relative_dest = os.path.join('model', dest_name)
+    dest_path = os.path.join(dest_dir, relative_dest)
+
+    ###########################################################
+    # Generate an operation tuple to copy the source to dest_dir
+
+    operation = (
+        recursive_copy, (full_src_path, dest_path, symlink),
+        'Making a copy of {0} at {1}'.format(full_src_path, dest_path)
+    )
+
+    ########################
+    # Update the config line
+
+    # dest_dir itself may be relative, so make the path truly absolute
+    # when that was requested.
+    config_path = os.path.abspath(dest_path) if absolute else relative_dest
+    # Replace only the path token itself; a plain str.replace would also
+    # rewrite any other text that happens to contain the same characters.
+    updated_config = _replace_last_token(
+        line_parts.config, src_path, config_path
+    )
+    if line_parts.comment:
+        line = '#'.join([updated_config, line_parts.comment])
+    else:
+        line = updated_config
+
+    return line, operation
+
+
+class _PackGrammarPath(str):
+    """
+    Marker subclass of str: lets command-line parsing tell a grammar
+    that should be packed apart from one that should simply be copied.
+    """
+
+
+def handle_args(clargs):
+    """
+    Build the command-line parser and parse clargs with it.
+
+    clargs is the argument list without the program name. Returns the
+    resulting argparse namespace. On a usage error the parser logs the
+    problem, prints the help text followed by EXAMPLE, and exits with
+    status 2.
+    """
+    class _BundleArgumentParser(argparse.ArgumentParser):
+        def error(self, message):
+            logging.error('ERROR: %s\n' % message)
+            self.print_help()
+            print(EXAMPLE)
+            sys.exit(2)
+
+    parser = _BundleArgumentParser(
+        description='create a Joshua configuration bundle from '
+                    'an existing configuration and set of files'
+    )
+    add = parser.add_argument
+
+    # Positional arguments
+    add('config', type=argparse.FileType('r'),
+        help='path to the origin configuration file. e.g. '
+             '/path/to/tune/dir/joshua.config.final')
+    add('dest_dir',
+        help='destination directory, which should not already exist. But if '
+             'it does, it will be removed if -f is used.')
+
+    # Options
+    add('-f', '--force', action='store_true',
+        help='extant destination directory will be overwritten')
+    add('-o', '--copy-config-options',
+        default='-top-n 0 -output-format %S -mark-oovs false',
+        help='optional additional or replacement configuration options for '
+             'Joshua, all surrounded by one pair of quotes. Defaults to '
+             " '-top-n 0 -output-format %%S -mark-oovs false'")
+    add('--server-port', type=int, default=5674,
+        help='specify the port to be used when running Joshua as a server')
+    add('-v', '--verbose', action='store_true',
+        help='print informational messages')
+    add('--no-comments', dest='suppress_comments', action='store_true',
+        help="delete comments and multiple consecutive empty lines")
+    add('--symlink', action='store_true',
+        help="symlink (where possible) to TM and LM files, instead of copying them")
+    add('--absolute', action='store_true',
+        help="Use absolute instead of relative paths for model file locations")
+    add('--source',
+        help="Source language two-character code (ISO 639-1)")
+    add('--normalizer', default=default_normalizer,
+        help="source sentence normalizer that was applied to the model")
+    add('--tokenizer', default=default_tokenizer,
+        help="source sentence tokenizer that was applied to the model")
+    add('-T', dest='tmpdir', default='/tmp',
+        help="temp directory")
+
+    return parser.parse_args(clargs)
+
+
+def write_string_to_file(path, text):
+    """
+    Create or overwrite the file at path so that it contains exactly
+    the given text.
+    """
+    with open(path, 'w') as out_file:
+        out_file.write(text)
+
+
+def collect_operations(opts):
+    """
+    Produce the ordered list of operations that build the bundle.
+
+    Nothing is written here (apart from running the copy-config.pl
+    filter and reading the input config); the returned list is meant to
+    be handed to execute_operations. Each element is in the format:
+      (function, (arguments,), 'logging message')
+
+    opts is the namespace returned by handle_args.
+
+    Raises Exception if the destination exists and --force was not
+    given, and PathException (prefixed with the config file name and
+    line number) if a path named in the config does not exist.
+    """
+    operations = []
+
+    #######################
+    # Destination directory
+    if os.path.exists(opts.dest_dir):
+        if not opts.force:
+            raise Exception(
+                'ERROR: The destination directory exists: "%s"\n'
+                'Use -f or --force option to overwrite the directory.'
+                % opts.dest_dir
+            )
+        operations.append(
+            (shutil.rmtree, (opts.dest_dir,),
+             'Forcing deletion of existing destination directory "%s"'
+             % opts.dest_dir)
+        )
+
+    # makedirs creates dest_dir on the way to its 'model' subdirectory.
+    operations.append(
+        (os.makedirs, (os.path.join(opts.dest_dir, 'model'),),
+         'Creating destination directory "%s"' % opts.dest_dir)
+    )
+
+    ##########################
+    # Input joshua.config file
+    try:
+        config_text = opts.config.read()
+    finally:
+        # argparse opened this handle; nothing else will close it.
+        opts.config.close()
+    if opts.copy_config_options:
+        config_text = filter_through_copy_config_script(
+            config_text,
+            opts.copy_config_options
+        )
+
+    config_lines = config_text.split('\n')
+
+    ###############
+    # Files to copy
+    # Parse the joshua.config and collect copy operations
+    result_config_lines = []
+    for line_num, line in enumerate(config_lines, 1):
+        if line_specifies_path(line):
+            try:
+                line, operation = process_line_containing_path(
+                    line, opts.dest_dir, opts.symlink, opts.absolute
+                )
+            except PathException as e:
+                # Prepend the file name and line number. A new exception is
+                # raised because assigning to e.message would not change
+                # str(e), and .message does not exist on Python 3.
+                raise PathException(
+                    'ERROR: Configuration file "{0}" line {1}: {2}'
+                    .format(opts.config.name, line_num, e)
+                )
+            operations.append(operation)
+        result_config_lines.append(line)
+
+    ###########################
+    # Output joshua.config file
+    # Create the Joshua configuration file for the package
+    path = os.path.join(opts.dest_dir, OUTPUT_CONFIG_FILE_NAME)
+    text = '\n'.join(result_config_lines) + '\n'
+    operations.append(
+        (write_string_to_file, (path, text),
+         'Writing the updated joshua.config to %s' % path
+         )
+    )
+
+    #######################
+    # Bundle runner scripts
+    # Write the scripts that run Joshua using the configuration and
+    # resource in the bundle, and make their mode world-readable, and
+    # world-executable.
+    for file_name, file_text in [[BUNDLE_RUNNER_FILE_NAME, BUNDLE_RUNNER_TEXT],]:
+        path = os.path.join(opts.dest_dir, file_name)
+        operations.append(
+            (write_string_to_file, (path, file_text),
+             'Writing the bundle runner file "%s"' % path)
+        )
+        mode = (stat.S_IREAD | stat.S_IRGRP | stat.S_IROTH |
+                stat.S_IEXEC | stat.S_IXGRP | stat.S_IXOTH)
+        operations.append(
+            (os.chmod, (path, mode),
+             'Making the bundle runner file executable')
+        )
+
+    return operations
+
+
+def execute_operations(operations):
+    """
+    Run each (function, arguments, message) operation in order, logging
+    its message at INFO level just before calling the function.
+    """
+    for step in operations:
+        action, arguments, description = step
+        logging.info(description)
+        action(*arguments)
+
+
+def main(argv):
+    """
+    Script entry point: parse argv (program name first), configure
+    logging, then collect and execute the bundle-building operations.
+
+    Any exception raised while collecting or executing is logged and
+    turned into exit status 2 by error_quit.
+    """
+    global opts
+    opts = handle_args(argv[1:])
+
+    logging.basicConfig(
+        level=logging.DEBUG if opts.verbose else logging.WARNING,
+        format='* %(message)s'
+    )
+
+    try:
+        operations = collect_operations(opts)
+        execute_operations(operations)
+    except Exception as e:
+        # .message only exists on Python 2 (and is empty for exceptions
+        # built from several arguments); fall back to str(e) so the user
+        # always gets a description.
+        error_quit(getattr(e, 'message', None) or str(e))
+
+
+if __name__ == "__main__":
+    # Explicit check rather than `assert`: assertions are stripped when
+    # Python runs with -O, which would silently skip this validation.
+    if not JOSHUA_PATH:
+        error_quit('ERROR: The JOSHUA environment variable must be defined.')
+
+    main(sys.argv)