You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/04/09 05:10:49 UTC
[40/44] incubator-joshua-site git commit: First attempt
http://git-wip-us.apache.org/repos/asf/incubator-joshua-site/blob/53cc3005/4.0/thrax.html
----------------------------------------------------------------------
diff --git a/4.0/thrax.html b/4.0/thrax.html
new file mode 100644
index 0000000..87ec518
--- /dev/null
+++ b/4.0/thrax.html
@@ -0,0 +1,264 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
+ <head>
+ <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
+ <link rel="stylesheet" type="text/css" media="screen,print" href="../joshua4.css" />
+ <title>Joshua | Grammar extraction with Thrax</title>
+ </head>
+
+ <body>
+
+ <div id="navbar">
+ <a href="http://joshua-decoder.org/">
+ <img src="../images/joshua-logo-small.png" width="130px"
+ alt="Joshua logo (picture of a Joshua tree)" />
+ </a>
+
+ <p class="infobox">
+ <b>Stable version</b><br />
+ 4.1<br/><br/>
+ <b>Release date</b><br />
+ 2013 January
+ </p>
+
+<!-- <div class="infobox"> -->
+<!-- <b>AUTO LINKS</b><br/> -->
+<!-- <ul> -->
+<!-- -->
+<!-- <li> Advanced features</li> -->
+<!-- -->
+<!-- <li> Advanced features</li> -->
+<!-- -->
+<!-- <li> Advanced features</li> -->
+<!-- -->
+<!-- <li> Building a language pack</li> -->
+<!-- -->
+<!-- <li> Building a language pack</li> -->
+<!-- -->
+<!-- <li> Bundling a configuration</li> -->
+<!-- -->
+<!-- <li> Contributors</li> -->
+<!-- -->
+<!-- <li> Decoder configuration parameters</li> -->
+<!-- -->
+<!-- <li> Decoder configuration parameters</li> -->
+<!-- -->
+<!-- <li> Decoder configuration parameters</li> -->
+<!-- -->
+<!-- <li> Decoder configuration parameters</li> -->
+<!-- -->
+<!-- <li> Frequently Asked Questions</li> -->
+<!-- -->
+<!-- <li> Common problems</li> -->
+<!-- -->
+<!-- <li> Frequently Asked Questions</li> -->
+<!-- -->
+<!-- <li> Common problems</li> -->
+<!-- -->
+<!-- <li> Features</li> -->
+<!-- -->
+<!-- <li> Features</li> -->
+<!-- -->
+<!-- <li> Features</li> -->
+<!-- -->
+<!-- <li> Features</li> -->
+<!-- -->
+<!-- <li> Joshua file formats</li> -->
+<!-- -->
+<!-- <li> Joshua file formats</li> -->
+<!-- -->
+<!-- <li> Joshua file formats</li> -->
+<!-- -->
+<!-- <li> Joshua file formats</li> -->
+<!-- -->
+<!-- <li> </li> -->
+<!-- -->
+<!-- <li> </li> -->
+<!-- -->
+<!-- <li> </li> -->
+<!-- -->
+<!-- <li> Fisher and CALLHOME Spanish English Speech Translation Corpus</li> -->
+<!-- -->
+<!-- <li> Indian Languages Parallel Corpora</li> -->
+<!-- -->
+<!-- <li> Joshua 4.0 User Documentation</li> -->
+<!-- -->
+<!-- <li> Language packs</li> -->
+<!-- -->
+<!-- <li> Paraphrase Packs</li> -->
+<!-- -->
+<!-- <li> Joshua releases</li> -->
+<!-- -->
+<!-- <li> Support</li> -->
+<!-- -->
+<!-- <li> Getting Started</li> -->
+<!-- -->
+<!-- <li> Welcome to Joshua</li> -->
+<!-- -->
+<!-- <li> Joshua documentation</li> -->
+<!-- -->
+<!-- <li> Joshua documentation</li> -->
+<!-- -->
+<!-- <li> Installation</li> -->
+<!-- -->
+<!-- <li> Installation</li> -->
+<!-- -->
+<!-- <li> Alignment with Jacana</li> -->
+<!-- -->
+<!-- <li> Alignment with Jacana</li> -->
+<!-- -->
+<!-- <li> Alignment with Jacana</li> -->
+<!-- -->
+<!-- <li> Building large LMs with SRILM</li> -->
+<!-- -->
+<!-- <li> Building large LMs with SRILM</li> -->
+<!-- -->
+<!-- <li> Building large LMs with SRILM</li> -->
+<!-- -->
+<!-- <li> Building large LMs with SRILM</li> -->
+<!-- -->
+<!-- <li> Lattice decoding</li> -->
+<!-- -->
+<!-- <li> Grammar Packing</li> -->
+<!-- -->
+<!-- <li> Grammar Packing</li> -->
+<!-- -->
+<!-- <li> Grammar Packing</li> -->
+<!-- -->
+<!-- <li> Grammar Packing</li> -->
+<!-- -->
+<!-- <li> The Joshua Pipeline</li> -->
+<!-- -->
+<!-- <li> The Joshua Pipeline</li> -->
+<!-- -->
+<!-- <li> The Joshua Pipeline</li> -->
+<!-- -->
+<!-- <li> The Joshua Pipeline</li> -->
+<!-- -->
+<!-- <li> Quick Start</li> -->
+<!-- -->
+<!-- <li> Quick Start</li> -->
+<!-- -->
+<!-- <li> Releases</li> -->
+<!-- -->
+<!-- <li> Server mode</li> -->
+<!-- -->
+<!-- <li> Server mode</li> -->
+<!-- -->
+<!-- <li> Server mode</li> -->
+<!-- -->
+<!-- <li> Installing and running the Joshua Decoder</li> -->
+<!-- -->
+<!-- <li> Grammar extraction with Thrax</li> -->
+<!-- -->
+<!-- <li> Grammar extraction with Thrax</li> -->
+<!-- -->
+<!-- <li> Grammar extraction with Thrax</li> -->
+<!-- -->
+<!-- <li> Grammar extraction with Thrax</li> -->
+<!-- -->
+<!-- <li> Building Translation Models</li> -->
+<!-- -->
+<!-- <li> Building Translation Models</li> -->
+<!-- -->
+<!-- <li> Building Translation Models</li> -->
+<!-- -->
+<!-- <li> Building Translation Models</li> -->
+<!-- -->
+<!-- <li> Pipeline tutorial</li> -->
+<!-- -->
+<!-- <li> Pipeline tutorial</li> -->
+<!-- -->
+<!-- <li> Pipeline tutorial</li> -->
+<!-- -->
+<!-- <li> What's New</li> -->
+<!-- -->
+<!-- <li> What's New</li> -->
+<!-- -->
+<!-- <li> Z-MERT</li> -->
+<!-- -->
+<!-- <li> Z-MERT</li> -->
+<!-- -->
+<!-- <li> Z-MERT</li> -->
+<!-- -->
+<!-- <li> Z-MERT</li> -->
+<!-- -->
+<!-- <li> </li> -->
+<!-- -->
+<!-- <li> </li> -->
+<!-- -->
+<!-- <li> </li> -->
+<!-- -->
+<!-- </ul> -->
+<!-- </div> -->
+
+ <div class="infobox">
+
+ <b>Links</b><br />
+ <ul>
+ <li> <a href="../index.html">Main</a> </li>
+ <li> <a href="pipeline.html">Pipeline</a> </li>
+ <li> <a href="step-by-step-instructions.html">Manual walkthrough</a> </li>
+ <li> <a href="decoder.html">Decoder</a> </li>
+ <li> <a href="server.html">Decoder Server</a> </li>
+ <li> <a href="file-formats.html">File formats</a> </li>
+ <li> <a href="thrax.html">Grammar Extraction</a> </li>
+ <li> <a href="../releases.html">Releases</a> </li>
+ </ul>
+ </div>
+
+ <div class="infobox">
+ <b>Advanced</b><br />
+ <ul>
+<!-- <li> <a href="packing.html">Grammar packing</a> </li> -->
+ <li> <a href="large-lms.html">Building large LMs</a> </li>
+ <li> <a href="zmert.html">Running Z-MERT</a> </li>
+ <li> <a href="lattice.html">Lattices</a> </li>
+ <li> <a href="server.html">TCP/IP server</a> </li>
+ <li> <a href="bundle.html">Bundled configuration</a> </li>
+ </ul>
+ </div>
+
+ <div class="infobox">
+ <b>Help</b><br />
+ <ul>
+ <li> <a href="faq.html">Answers</a> </li>
+ <li> <a href="https://groups.google.com/d/forum/joshua_support">Archive</a> </li>
+ </ul>
+ </div>
+
+ <div class="footer">
+ Last updated on April 08, 2016
+ </div>
+
+ </div>
+
+ <div id="main">
+ <div id="title">
+ <h1>Grammar extraction with Thrax</h1>
+ </div>
+
+ <div id="content">
+
+ <p>One day, this will hold Thrax documentation, including how to use Thrax, how to do grammar
+filtering, and details on the configuration file options. It will also include details about our
+experience setting up and maintaining Hadoop cluster installations, knowledge wrought of hard-fought
+sweat and tears.</p>
+
+<p>In the meantime, please bother <a href="http://cs.jhu.edu/~jonny/">Jonny Weese</a> if there is something you
+need to do that you don’t understand. You might also be able to dig up some information <a href="http://cs.jhu.edu/~jonny/thrax/">on the old
+Thrax page</a>.</p>
+
+
+ </div>
+ </div>
+
+ </body>
+</html>
+
+
+
+
+
http://git-wip-us.apache.org/repos/asf/incubator-joshua-site/blob/53cc3005/4.0/thrax.md
----------------------------------------------------------------------
diff --git a/4.0/thrax.md b/4.0/thrax.md
deleted file mode 100644
index 6b276b0..0000000
--- a/4.0/thrax.md
+++ /dev/null
@@ -1,14 +0,0 @@
----
-layout: default4
-category: advanced
-title: Grammar extraction with Thrax
----
-
-One day, this will hold Thrax documentation, including how to use Thrax, how to do grammar
-filtering, and details on the configuration file options. It will also include details about our
-experience setting up and maintaining Hadoop cluster installations, knowledge wrought of hard-fought
-sweat and tears.
-
-In the meantime, please bother [Jonny Weese](http://cs.jhu.edu/~jonny/) if there is something you
-need to do that you don't understand. You might also be able to dig up some information [on the old
-Thrax page](http://cs.jhu.edu/~jonny/thrax/).
http://git-wip-us.apache.org/repos/asf/incubator-joshua-site/blob/53cc3005/4.0/tms.html
----------------------------------------------------------------------
diff --git a/4.0/tms.html b/4.0/tms.html
new file mode 100644
index 0000000..1e38df8
--- /dev/null
+++ b/4.0/tms.html
@@ -0,0 +1,377 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
+ <head>
+ <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
+ <link rel="stylesheet" type="text/css" media="screen,print" href="../joshua4.css" />
+ <title>Joshua | Building Translation Models</title>
+ </head>
+
+ <body>
+
+ <div id="navbar">
+ <a href="http://joshua-decoder.org/">
+ <img src="../images/joshua-logo-small.png" width="130px"
+ alt="Joshua logo (picture of a Joshua tree)" />
+ </a>
+
+ <p class="infobox">
+ <b>Stable version</b><br />
+ 4.1<br/><br/>
+ <b>Release date</b><br />
+ 2013 January
+ </p>
+
+<!-- <div class="infobox"> -->
+<!-- <b>AUTO LINKS</b><br/> -->
+<!-- <ul> -->
+<!-- -->
+<!-- <li> Advanced features</li> -->
+<!-- -->
+<!-- <li> Advanced features</li> -->
+<!-- -->
+<!-- <li> Advanced features</li> -->
+<!-- -->
+<!-- <li> Building a language pack</li> -->
+<!-- -->
+<!-- <li> Building a language pack</li> -->
+<!-- -->
+<!-- <li> Bundling a configuration</li> -->
+<!-- -->
+<!-- <li> Contributors</li> -->
+<!-- -->
+<!-- <li> Decoder configuration parameters</li> -->
+<!-- -->
+<!-- <li> Decoder configuration parameters</li> -->
+<!-- -->
+<!-- <li> Decoder configuration parameters</li> -->
+<!-- -->
+<!-- <li> Decoder configuration parameters</li> -->
+<!-- -->
+<!-- <li> Frequently Asked Questions</li> -->
+<!-- -->
+<!-- <li> Common problems</li> -->
+<!-- -->
+<!-- <li> Frequently Asked Questions</li> -->
+<!-- -->
+<!-- <li> Common problems</li> -->
+<!-- -->
+<!-- <li> Features</li> -->
+<!-- -->
+<!-- <li> Features</li> -->
+<!-- -->
+<!-- <li> Features</li> -->
+<!-- -->
+<!-- <li> Features</li> -->
+<!-- -->
+<!-- <li> Joshua file formats</li> -->
+<!-- -->
+<!-- <li> Joshua file formats</li> -->
+<!-- -->
+<!-- <li> Joshua file formats</li> -->
+<!-- -->
+<!-- <li> Joshua file formats</li> -->
+<!-- -->
+<!-- <li> </li> -->
+<!-- -->
+<!-- <li> </li> -->
+<!-- -->
+<!-- <li> </li> -->
+<!-- -->
+<!-- <li> Fisher and CALLHOME Spanish English Speech Translation Corpus</li> -->
+<!-- -->
+<!-- <li> Indian Languages Parallel Corpora</li> -->
+<!-- -->
+<!-- <li> Joshua 4.0 User Documentation</li> -->
+<!-- -->
+<!-- <li> Language packs</li> -->
+<!-- -->
+<!-- <li> Paraphrase Packs</li> -->
+<!-- -->
+<!-- <li> Joshua releases</li> -->
+<!-- -->
+<!-- <li> Support</li> -->
+<!-- -->
+<!-- <li> Getting Started</li> -->
+<!-- -->
+<!-- <li> Welcome to Joshua</li> -->
+<!-- -->
+<!-- <li> Joshua documentation</li> -->
+<!-- -->
+<!-- <li> Joshua documentation</li> -->
+<!-- -->
+<!-- <li> Installation</li> -->
+<!-- -->
+<!-- <li> Installation</li> -->
+<!-- -->
+<!-- <li> Alignment with Jacana</li> -->
+<!-- -->
+<!-- <li> Alignment with Jacana</li> -->
+<!-- -->
+<!-- <li> Alignment with Jacana</li> -->
+<!-- -->
+<!-- <li> Building large LMs with SRILM</li> -->
+<!-- -->
+<!-- <li> Building large LMs with SRILM</li> -->
+<!-- -->
+<!-- <li> Building large LMs with SRILM</li> -->
+<!-- -->
+<!-- <li> Building large LMs with SRILM</li> -->
+<!-- -->
+<!-- <li> Lattice decoding</li> -->
+<!-- -->
+<!-- <li> Grammar Packing</li> -->
+<!-- -->
+<!-- <li> Grammar Packing</li> -->
+<!-- -->
+<!-- <li> Grammar Packing</li> -->
+<!-- -->
+<!-- <li> Grammar Packing</li> -->
+<!-- -->
+<!-- <li> The Joshua Pipeline</li> -->
+<!-- -->
+<!-- <li> The Joshua Pipeline</li> -->
+<!-- -->
+<!-- <li> The Joshua Pipeline</li> -->
+<!-- -->
+<!-- <li> The Joshua Pipeline</li> -->
+<!-- -->
+<!-- <li> Quick Start</li> -->
+<!-- -->
+<!-- <li> Quick Start</li> -->
+<!-- -->
+<!-- <li> Releases</li> -->
+<!-- -->
+<!-- <li> Server mode</li> -->
+<!-- -->
+<!-- <li> Server mode</li> -->
+<!-- -->
+<!-- <li> Server mode</li> -->
+<!-- -->
+<!-- <li> Installing and running the Joshua Decoder</li> -->
+<!-- -->
+<!-- <li> Grammar extraction with Thrax</li> -->
+<!-- -->
+<!-- <li> Grammar extraction with Thrax</li> -->
+<!-- -->
+<!-- <li> Grammar extraction with Thrax</li> -->
+<!-- -->
+<!-- <li> Grammar extraction with Thrax</li> -->
+<!-- -->
+<!-- <li> Building Translation Models</li> -->
+<!-- -->
+<!-- <li> Building Translation Models</li> -->
+<!-- -->
+<!-- <li> Building Translation Models</li> -->
+<!-- -->
+<!-- <li> Building Translation Models</li> -->
+<!-- -->
+<!-- <li> Pipeline tutorial</li> -->
+<!-- -->
+<!-- <li> Pipeline tutorial</li> -->
+<!-- -->
+<!-- <li> Pipeline tutorial</li> -->
+<!-- -->
+<!-- <li> What's New</li> -->
+<!-- -->
+<!-- <li> What's New</li> -->
+<!-- -->
+<!-- <li> Z-MERT</li> -->
+<!-- -->
+<!-- <li> Z-MERT</li> -->
+<!-- -->
+<!-- <li> Z-MERT</li> -->
+<!-- -->
+<!-- <li> Z-MERT</li> -->
+<!-- -->
+<!-- <li> </li> -->
+<!-- -->
+<!-- <li> </li> -->
+<!-- -->
+<!-- <li> </li> -->
+<!-- -->
+<!-- </ul> -->
+<!-- </div> -->
+
+ <div class="infobox">
+
+ <b>Links</b><br />
+ <ul>
+ <li> <a href="../index.html">Main</a> </li>
+ <li> <a href="pipeline.html">Pipeline</a> </li>
+ <li> <a href="step-by-step-instructions.html">Manual walkthrough</a> </li>
+ <li> <a href="decoder.html">Decoder</a> </li>
+ <li> <a href="server.html">Decoder Server</a> </li>
+ <li> <a href="file-formats.html">File formats</a> </li>
+ <li> <a href="thrax.html">Grammar Extraction</a> </li>
+ <li> <a href="../releases.html">Releases</a> </li>
+ </ul>
+ </div>
+
+ <div class="infobox">
+ <b>Advanced</b><br />
+ <ul>
+<!-- <li> <a href="packing.html">Grammar packing</a> </li> -->
+ <li> <a href="large-lms.html">Building large LMs</a> </li>
+ <li> <a href="zmert.html">Running Z-MERT</a> </li>
+ <li> <a href="lattice.html">Lattices</a> </li>
+ <li> <a href="server.html">TCP/IP server</a> </li>
+ <li> <a href="bundle.html">Bundled configuration</a> </li>
+ </ul>
+ </div>
+
+ <div class="infobox">
+ <b>Help</b><br />
+ <ul>
+ <li> <a href="faq.html">Answers</a> </li>
+ <li> <a href="https://groups.google.com/d/forum/joshua_support">Archive</a> </li>
+ </ul>
+ </div>
+
+ <div class="footer">
+ Last updated on April 08, 2016
+ </div>
+
+ </div>
+
+ <div id="main">
+ <div id="title">
+ <h1>Building Translation Models</h1>
+ </div>
+
+ <div id="content">
+
+ <h1 id="build-a-translation-model">Build a translation model</h1>
+
+<p>Extracting a grammar from a large amount of data is a multi-step process. The first requirement is parallel data. The Europarl, Call Home, and Fisher corpora all contain parallel translations of Spanish and English sentences.</p>
+
+<p>We will copy (or symlink) the parallel source text files into a subdirectory called <code class="highlighter-rouge">input/</code>.</p>
+
+<p>Then, we concatenate all the training files on each side. The pipeline script normally does tokenization and normalization, but in this instance we have a custom tokenizer we need to apply to the source side, so we have to do it manually and then skip that step using the <code class="highlighter-rouge">pipeline.pl</code> option <code class="highlighter-rouge">--first-step align</code>.</p>
+
+<ul>
+ <li>
+ <p>to tokenize the English data, do</p>
+
+ <div class="highlighter-rouge"><pre class="highlight"><code>cat callhome.en europarl.en fisher.en > all.en | $JOSHUA/scripts/training/normalize-punctuation.pl en | $JOSHUA/scripts/training/penn-treebank-tokenizer.perl | $JOSHUA/scripts/lowercase.perl > all.norm.tok.lc.en
+</code></pre>
+ </div>
+ </li>
+</ul>
+
+<p>The same can be done for the Spanish side of the input data:</p>
+
+<div class="highlighter-rouge"><pre class="highlight"><code>cat callhome.es europarl.es fisher.es > all.es | $JOSHUA/scripts/training/normalize-punctuation.pl es | $JOSHUA/scripts/training/penn-treebank-tokenizer.perl | $JOSHUA/scripts/lowercase.perl > all.norm.tok.lc.es
+</code></pre>
+</div>
+
+<p>By the way, an alternative tokenizer is a Twitter tokenizer found in the <a href="http://github.com/vandurme/jerboa">Jerboa</a> project.</p>
+
+<p>The final step in the training data preparation is to remove all examples in which either of the language sides is a blank line.</p>
+
+<div class="highlighter-rouge"><pre class="highlight"><code>paste all.norm.tok.lc.es all.norm.tok.lc.en | grep -Pv "^\t|\t$" \
+ | ./splittabs.pl all.norm.tok.lc.noblanks.es all.norm.tok.lc.noblanks.en
+</code></pre>
+</div>
+
+<p>Contents of <code class="highlighter-rouge">splittabs.pl</code> by Matt Post:</p>
+
+<div class="highlighter-rouge"><pre class="highlight"><code><span class="c1">#!/usr/bin/perl</span>
+
+<span class="c1"># splits on tab, printing respective chunks to the list of files given</span>
+<span class="c1"># as script arguments</span>
+
+<span class="k">use</span> <span class="nv">FileHandle</span><span class="p">;</span>
+
+<span class="k">my</span> <span class="nv">@fh</span><span class="p">;</span>
+<span class="vg">$|</span> <span class="o">=</span> <span class="mi">1</span><span class="p">;</span> <span class="c1"># don't buffer output</span>
+
+<span class="k">if</span> <span class="p">(</span><span class="nv">@ARGV</span> <span class="o"><</span> <span class="mi">0</span><span class="p">)</span> <span class="p">{</span>
+ <span class="k">print</span> <span class="s">"Usage: splittabs.pl < tabbed-file\n"</span><span class="p">;</span>
+ <span class="nb">exit</span><span class="p">;</span>
+<span class="p">}</span>
+
+<span class="k">my</span> <span class="nv">@fh</span> <span class="o">=</span> <span class="nb">map</span> <span class="p">{</span> <span class="nv">get_filehandle</span><span class="p">(</span><span class="nv">$_</span><span class="p">)</span> <span class="p">}</span> <span class="nv">@ARGV</span><span class="p">;</span>
+<span class="nv">@ARGV</span> <span class="o">=</span> <span class="p">();</span>
+
+<span class="k">while</span> <span class="p">(</span><span class="k">my</span> <span class="nv">$line</span> <span class="o">=</span> <span class="o"><></span><span class="p">)</span> <span class="p">{</span>
+ <span class="nb">chomp</span><span class="p">(</span><span class="nv">$line</span><span class="p">);</span>
+ <span class="k">my</span> <span class="p">(</span><span class="nv">@fields</span><span class="p">)</span> <span class="o">=</span> <span class="nb">split</span><span class="p">(</span><span class="sr">/\t/</span><span class="p">,</span><span class="nv">$line</span><span class="p">,</span><span class="nb">scalar</span> <span class="nv">@fh</span><span class="p">);</span>
+
+ <span class="nb">map</span> <span class="p">{</span> <span class="k">print</span> <span class="p">{</span><span class="nv">$fh</span><span class="p">[</span><span class="nv">$_</span><span class="p">]}</span> <span class="s">"$fields[$_]\n"</span> <span class="p">}</span> <span class="p">(</span><span class="mi">0</span><span class="o">..</span><span class="nv">$#fields</span><span class="p">);</span>
+<span class="p">}</span>
+
+<span class="k">sub </span><span class="nf">get_filehandle</span> <span class="p">{</span>
+ <span class="k">my</span> <span class="nv">$file</span> <span class="o">=</span> <span class="nb">shift</span><span class="p">;</span>
+
+ <span class="k">if</span> <span class="p">(</span><span class="nv">$file</span> <span class="ow">eq</span> <span class="s">"-"</span><span class="p">)</span> <span class="p">{</span>
+ <span class="k">return</span> <span class="o">*</span><span class="bp">STDOUT</span><span class="p">;</span>
+ <span class="p">}</span> <span class="k">else</span> <span class="p">{</span>
+ <span class="nb">local</span> <span class="o">*</span><span class="nv">FH</span><span class="p">;</span>
+ <span class="nb">open</span> <span class="nv">FH</span><span class="p">,</span> <span class="s">">$file"</span> <span class="ow">or</span> <span class="nb">die</span> <span class="s">"can't open '$file' for writing"</span><span class="p">;</span>
+ <span class="k">return</span> <span class="o">*</span><span class="nv">FH</span><span class="p">;</span>
+ <span class="p">}</span>
+<span class="p">}</span>
+</code></pre>
+</div>
+
+<p>Now we can run the pipeline to extract the grammar. Run the following script:</p>
+
+<div class="highlighter-rouge"><pre class="highlight"><code><span class="c">#!/bin/bash</span>
+
+<span class="c"># this creates a grammar</span>
+
+<span class="c"># NEED:</span>
+<span class="c"># pair</span>
+<span class="c"># type</span>
+
+<span class="nb">set</span> -u
+
+<span class="nv">pair</span><span class="o">=</span>es-en
+<span class="nb">type</span><span class="o">=</span>hiero
+
+<span class="c">#. ~/.bashrc</span>
+
+<span class="c">#basedir=$(pwd)</span>
+
+<span class="nv">dir</span><span class="o">=</span>grammar-<span class="nv">$pair</span>-<span class="nv">$type</span>
+
+<span class="o">[[</span> ! -d <span class="nv">$dir</span> <span class="o">]]</span> <span class="o">&&</span> mkdir -p <span class="nv">$dir</span>
+<span class="nb">cd</span> <span class="nv">$dir</span>
+
+<span class="nb">source</span><span class="o">=</span><span class="k">$(</span><span class="nb">echo</span> <span class="nv">$pair</span> | cut -d- -f 1<span class="k">)</span>
+<span class="nv">target</span><span class="o">=</span><span class="k">$(</span><span class="nb">echo</span> <span class="nv">$pair</span> | cut -d- -f 2<span class="k">)</span>
+
+<span class="nv">$JOSHUA</span>/scripts/training/pipeline.pl <span class="se">\</span>
+ --source <span class="nv">$source</span> <span class="se">\</span>
+ --target <span class="nv">$target</span> <span class="se">\</span>
+ --corpus /home/hltcoe/lorland/expts/scale12/model1/input/all.norm.tok.lc.noblanks <span class="se">\</span>
+ --type <span class="nv">$type</span> <span class="se">\</span>
+ --joshua-mem 100g <span class="se">\</span>
+ --no-prepare <span class="se">\</span>
+ --first-step align <span class="se">\</span>
+ --last-step thrax <span class="se">\</span>
+ --hadoop <span class="nv">$HADOOP</span> <span class="se">\</span>
+ --threads 8 <span class="se">\</span>
+</code></pre>
+</div>
+
+
+ </div>
+ </div>
+
+ </body>
+</html>
+
+
+
+
+
http://git-wip-us.apache.org/repos/asf/incubator-joshua-site/blob/53cc3005/4.0/tms.md
----------------------------------------------------------------------
diff --git a/4.0/tms.md b/4.0/tms.md
deleted file mode 100644
index a86a311..0000000
--- a/4.0/tms.md
+++ /dev/null
@@ -1,106 +0,0 @@
----
-layout: default4
-category: advanced
-title: Building Translation Models
----
-
-# Build a translation model
-
-Extracting a grammar from a large amount of data is a multi-step process. The first requirement is parallel data. The Europarl, Call Home, and Fisher corpora all contain parallel translations of Spanish and English sentences.
-
-We will copy (or symlink) the parallel source text files in a subdirectory called `input/`.
-
-Then, we concatenate all the training files on each side. The pipeline script normally does tokenization and normalization, but in this instance we have a custom tokenizer we need to apply to the source side, so we have to do it manually and then skip that step using the `pipeline.pl` option `--first-step alignment`.
-
-* to tokenize the English data, do
-
- cat callhome.en europarl.en fisher.en > all.en | $JOSHUA/scripts/training/normalize-punctuation.pl en | $JOSHUA/scripts/training/penn-treebank-tokenizer.perl | $JOSHUA/scripts/lowercase.perl > all.norm.tok.lc.en
-
-The same can be done for the Spanish side of the input data:
-
- cat callhome.es europarl.es fisher.es > all.es | $JOSHUA/scripts/training/normalize-punctuation.pl es | $JOSHUA/scripts/training/penn-treebank-tokenizer.perl | $JOSHUA/scripts/lowercase.perl > all.norm.tok.lc.es
-
-By the way, an alternative tokenizer is a Twitter tokenizer found in the [Jerboa](http://github.com/vandurme/jerboa) project.
-
-The final step in the training data preparation is to remove all examples in which either of the language sides is a blank line.
-
- paste all.norm.tok.lc.es all.norm.tok.lc.en | grep -Pv "^\t|\t$" \
- | ./splittabs.pl all.norm.tok.lc.noblanks.es all.norm.tok.lc.noblanks.en
-
-contents of `splittabls.pl` by Matt Post:
-
- #!/usr/bin/perl
-
- # splits on tab, printing respective chunks to the list of files given
- # as script arguments
-
- use FileHandle;
-
- my @fh;
- $| = 1; # don't buffer output
-
- if (@ARGV < 0) {
- print "Usage: splittabs.pl < tabbed-file\n";
- exit;
- }
-
- my @fh = map { get_filehandle($_) } @ARGV;
- @ARGV = ();
-
- while (my $line = <>) {
- chomp($line);
- my (@fields) = split(/\t/,$line,scalar @fh);
-
- map { print {$fh[$_]} "$fields[$_]\n" } (0..$#fields);
- }
-
- sub get_filehandle {
- my $file = shift;
-
- if ($file eq "-") {
- return *STDOUT;
- } else {
- local *FH;
- open FH, ">$file" or die "can't open '$file' for writing";
- return *FH;
- }
- }
-
-Now we can run the pipeline to extract the grammar. Run the following script:
-
- #!/bin/bash
-
- # this creates a grammar
-
- # NEED:
- # pair
- # type
-
- set -u
-
- pair=es-en
- type=hiero
-
- #. ~/.bashrc
-
- #basedir=$(pwd)
-
- dir=grammar-$pair-$type
-
- [[ ! -d $dir ]] && mkdir -p $dir
- cd $dir
-
- source=$(echo $pair | cut -d- -f 1)
- target=$(echo $pair | cut -d- -f 2)
-
- $JOSHUA/scripts/training/pipeline.pl \
- --source $source \
- --target $target \
- --corpus /home/hltcoe/lorland/expts/scale12/model1/input/all.norm.tok.lc.noblanks \
- --type $type \
- --joshua-mem 100g \
- --no-prepare \
- --first-step align \
- --last-step thrax \
- --hadoop $HADOOP \
- --threads 8 \
http://git-wip-us.apache.org/repos/asf/incubator-joshua-site/blob/53cc3005/4.0/zmert.html
----------------------------------------------------------------------
diff --git a/4.0/zmert.html b/4.0/zmert.html
new file mode 100644
index 0000000..a589161
--- /dev/null
+++ b/4.0/zmert.html
@@ -0,0 +1,339 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
+ <head>
+ <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
+ <link rel="stylesheet" type="text/css" media="screen,print" href="../joshua4.css" />
+ <title>Joshua | Z-MERT</title>
+ </head>
+
+ <body>
+
+ <div id="navbar">
+ <a href="http://joshua-decoder.org/">
+ <img src="../images/joshua-logo-small.png" width="130px"
+ alt="Joshua logo (picture of a Joshua tree)" />
+ </a>
+
+ <p class="infobox">
+ <b>Stable version</b><br />
+ 4.1<br/><br/>
+ <b>Release date</b><br />
+ 2013 January
+ </p>
+
+<!-- <div class="infobox"> -->
+<!-- <b>AUTO LINKS</b><br/> -->
+<!-- <ul> -->
+<!-- -->
+<!-- <li> Advanced features</li> -->
+<!-- -->
+<!-- <li> Advanced features</li> -->
+<!-- -->
+<!-- <li> Advanced features</li> -->
+<!-- -->
+<!-- <li> Building a language pack</li> -->
+<!-- -->
+<!-- <li> Building a language pack</li> -->
+<!-- -->
+<!-- <li> Bundling a configuration</li> -->
+<!-- -->
+<!-- <li> Contributors</li> -->
+<!-- -->
+<!-- <li> Decoder configuration parameters</li> -->
+<!-- -->
+<!-- <li> Decoder configuration parameters</li> -->
+<!-- -->
+<!-- <li> Decoder configuration parameters</li> -->
+<!-- -->
+<!-- <li> Decoder configuration parameters</li> -->
+<!-- -->
+<!-- <li> Frequently Asked Questions</li> -->
+<!-- -->
+<!-- <li> Common problems</li> -->
+<!-- -->
+<!-- <li> Frequently Asked Questions</li> -->
+<!-- -->
+<!-- <li> Common problems</li> -->
+<!-- -->
+<!-- <li> Features</li> -->
+<!-- -->
+<!-- <li> Features</li> -->
+<!-- -->
+<!-- <li> Features</li> -->
+<!-- -->
+<!-- <li> Features</li> -->
+<!-- -->
+<!-- <li> Joshua file formats</li> -->
+<!-- -->
+<!-- <li> Joshua file formats</li> -->
+<!-- -->
+<!-- <li> Joshua file formats</li> -->
+<!-- -->
+<!-- <li> Joshua file formats</li> -->
+<!-- -->
+<!-- <li> </li> -->
+<!-- -->
+<!-- <li> </li> -->
+<!-- -->
+<!-- <li> </li> -->
+<!-- -->
+<!-- <li> Fisher and CALLHOME Spanish English Speech Translation Corpus</li> -->
+<!-- -->
+<!-- <li> Indian Languages Parallel Corpora</li> -->
+<!-- -->
+<!-- <li> Joshua 4.0 User Documentation</li> -->
+<!-- -->
+<!-- <li> Language packs</li> -->
+<!-- -->
+<!-- <li> Paraphrase Packs</li> -->
+<!-- -->
+<!-- <li> Joshua releases</li> -->
+<!-- -->
+<!-- <li> Support</li> -->
+<!-- -->
+<!-- <li> Getting Started</li> -->
+<!-- -->
+<!-- <li> Welcome to Joshua</li> -->
+<!-- -->
+<!-- <li> Joshua documentation</li> -->
+<!-- -->
+<!-- <li> Joshua documentation</li> -->
+<!-- -->
+<!-- <li> Installation</li> -->
+<!-- -->
+<!-- <li> Installation</li> -->
+<!-- -->
+<!-- <li> Alignment with Jacana</li> -->
+<!-- -->
+<!-- <li> Alignment with Jacana</li> -->
+<!-- -->
+<!-- <li> Alignment with Jacana</li> -->
+<!-- -->
+<!-- <li> Building large LMs with SRILM</li> -->
+<!-- -->
+<!-- <li> Building large LMs with SRILM</li> -->
+<!-- -->
+<!-- <li> Building large LMs with SRILM</li> -->
+<!-- -->
+<!-- <li> Building large LMs with SRILM</li> -->
+<!-- -->
+<!-- <li> Lattice decoding</li> -->
+<!-- -->
+<!-- <li> Grammar Packing</li> -->
+<!-- -->
+<!-- <li> Grammar Packing</li> -->
+<!-- -->
+<!-- <li> Grammar Packing</li> -->
+<!-- -->
+<!-- <li> Grammar Packing</li> -->
+<!-- -->
+<!-- <li> The Joshua Pipeline</li> -->
+<!-- -->
+<!-- <li> The Joshua Pipeline</li> -->
+<!-- -->
+<!-- <li> The Joshua Pipeline</li> -->
+<!-- -->
+<!-- <li> The Joshua Pipeline</li> -->
+<!-- -->
+<!-- <li> Quick Start</li> -->
+<!-- -->
+<!-- <li> Quick Start</li> -->
+<!-- -->
+<!-- <li> Releases</li> -->
+<!-- -->
+<!-- <li> Server mode</li> -->
+<!-- -->
+<!-- <li> Server mode</li> -->
+<!-- -->
+<!-- <li> Server mode</li> -->
+<!-- -->
+<!-- <li> Installing and running the Joshua Decoder</li> -->
+<!-- -->
+<!-- <li> Grammar extraction with Thrax</li> -->
+<!-- -->
+<!-- <li> Grammar extraction with Thrax</li> -->
+<!-- -->
+<!-- <li> Grammar extraction with Thrax</li> -->
+<!-- -->
+<!-- <li> Grammar extraction with Thrax</li> -->
+<!-- -->
+<!-- <li> Building Translation Models</li> -->
+<!-- -->
+<!-- <li> Building Translation Models</li> -->
+<!-- -->
+<!-- <li> Building Translation Models</li> -->
+<!-- -->
+<!-- <li> Building Translation Models</li> -->
+<!-- -->
+<!-- <li> Pipeline tutorial</li> -->
+<!-- -->
+<!-- <li> Pipeline tutorial</li> -->
+<!-- -->
+<!-- <li> Pipeline tutorial</li> -->
+<!-- -->
+<!-- <li> What's New</li> -->
+<!-- -->
+<!-- <li> What's New</li> -->
+<!-- -->
+<!-- <li> Z-MERT</li> -->
+<!-- -->
+<!-- <li> Z-MERT</li> -->
+<!-- -->
+<!-- <li> Z-MERT</li> -->
+<!-- -->
+<!-- <li> Z-MERT</li> -->
+<!-- -->
+<!-- <li> </li> -->
+<!-- -->
+<!-- <li> </li> -->
+<!-- -->
+<!-- <li> </li> -->
+<!-- -->
+<!-- </ul> -->
+<!-- </div> -->
+
+ <div class="infobox">
+
+ <b>Links</b><br />
+ <ul>
+ <li> <a href="../index.html">Main</a> </li>
+ <li> <a href="pipeline.html">Pipeline</a> </li>
+ <li> <a href="step-by-step-instructions.html">Manual walkthrough</a> </li>
+ <li> <a href="decoder.html">Decoder</a> </li>
+ <li> <a href="server.html">Decoder Server</a> </li>
+ <li> <a href="file-formats.html">File formats</a> </li>
+ <li> <a href="thrax.html">Grammar Extraction</a> </li>
+ <li> <a href="../releases.html">Releases</a> </li>
+ </ul>
+ </div>
+
+ <div class="infobox">
+ <b>Advanced</b><br />
+ <ul>
+<!-- <li> <a href="packing.html">Grammar packing</a> </li> -->
+ <li> <a href="large-lms.html">Building large LMs</a> </li>
+ <li> <a href="zmert.html">Running Z-MERT</a> </li>
+ <li> <a href="lattice.html">Lattices</a> </li>
+ <li> <a href="server.html">TCP/IP server</a> </li>
+ <li> <a href="bundle.html">Bundled configuration</a> </li>
+ </ul>
+ </div>
+
+ <div class="infobox">
+ <b>Help</b><br />
+ <ul>
+ <li> <a href="faq.html">Answers</a> </li>
+ <li> <a href="https://groups.google.com/d/forum/joshua_support">Archive</a> </li>
+ </ul>
+ </div>
+
+ <div class="footer">
+ Last updated on April 08, 2016
+ </div>
+
+ </div>
+
+ <div id="main">
+ <div id="title">
+ <h1>Z-MERT</h1>
+ </div>
+
+ <div id="content">
+
+ <p>This document describes how to manually run the ZMERT module. ZMERT is Joshua’s minimum error-rate
+training module, written by Omar F. Zaidan. It is easily adapted to drop in different decoders, and
+was also written so as to work with different objective functions (other than BLEU).</p>
+
+<p>(Section (1) in <code class="highlighter-rouge">$JOSHUA/examples/ZMERT/README_ZMERT.txt</code> is an expanded version of this section.)</p>
+
+<p>Z-MERT can be used by launching the driver program (<code class="highlighter-rouge">ZMERT.java</code>), which expects a config file as
+its main argument. This config file can be used to specify any subset of Z-MERT’s 20-some
+parameters. For a full list of those parameters, and their default values, run ZMERT with a single
+-h argument as follows:</p>
+
+<div class="highlighter-rouge"><pre class="highlight"><code>java -cp $JOSHUA/bin joshua.zmert.ZMERT -h
+</code></pre>
+</div>
+
+<p>So what does a Z-MERT config file look like?</p>
+
+<p>Examine the file <code class="highlighter-rouge">examples/ZMERT/ZMERT_config_ex2.txt</code>. You will find that it
+specifies the following “main” MERT parameters:</p>
+
+<div class="highlighter-rouge"><pre class="highlight"><code>(*) -dir dirPrefix: working directory
+(*) -s sourceFile: source sentences (foreign sentences) of the MERT dataset
+(*) -r refFile: target sentences (reference translations) of the MERT dataset
+(*) -rps refsPerSen: number of reference translations per sentence
+(*) -p paramsFile: file containing parameter names, initial values, and ranges
+(*) -maxIt maxMERTIts: maximum number of MERT iterations
+(*) -ipi initsPerIt: number of intermediate initial points per iteration
+(*) -cmd commandFile: name of file containing commands to run the decoder
+(*) -decOut decoderOutFile: name of the output file produced by the decoder
+(*) -dcfg decConfigFile: name of decoder config file
+(*) -N N: size of N-best list (per sentence) generated in each MERT iteration
+(*) -v verbosity: output verbosity level (0-2; higher value => more verbose)
+(*) -seed seed: seed used to initialize the random number generator
+</code></pre>
+</div>
+
+<p>(Note that the <code class="highlighter-rouge">-s</code> parameter is only used if Z-MERT is running Joshua as an
+ internal decoder. If Joshua is run as an external decoder, as is the case in
+ this README, then this parameter is ignored.)</p>
+
+<p>To test Z-MERT on the 100-sentence test set of example2, provide this config
+file to Z-MERT as follows:</p>
+
+<div class="highlighter-rouge"><pre class="highlight"><code>java -cp bin joshua.zmert.ZMERT -maxMem 500 examples/ZMERT/ZMERT_config_ex2.txt > examples/ZMERT/ZMERT_example/ZMERT.out
+</code></pre>
+</div>
+
+<p>This will run Z-MERT for a couple of iterations on the data from the example2
+folder. (Notice that we have made copies of the source and reference files
+from example2 and renamed them as src.txt and ref.* in the ZMERT_example folder,
+just to have all the files needed by Z-MERT in one place.) Once the Z-MERT run
+is complete, you should be able to inspect the log file to see what kinds of
+things it did. If everything goes well, the run should take a few minutes, of
+which more than 95% is time spent by Z-MERT waiting on Joshua to finish
+decoding the sentences (once per iteration).</p>
+
+<p>The output file you get should be equivalent to <code class="highlighter-rouge">ZMERT.out.verbosity1</code>. If you
+rerun the experiment with the verbosity (-v) argument set to 2 instead of 1,
+the output file you get should be equivalent to <code class="highlighter-rouge">ZMERT.out.verbosity2</code>, which has
+more interesting details about what Z-MERT does.</p>
+
+<p>Notice the additional <code class="highlighter-rouge">-maxMem</code> argument. It tells Z-MERT that it should not
+continue to hold on to memory while the decoder is running (during which time Z-MERT
+would be idle). The 500 tells Z-MERT that it can only use a maximum of 500 MB.
+For more details on this issue, see section (4) in Z-MERT’s README.</p>
+
+<p>A quick note about Z-MERT’s interaction with the decoder. If you examine the
+file <code class="highlighter-rouge">decoder_command_ex2.txt</code>, which is provided as the commandFile (<code class="highlighter-rouge">-cmd</code>)
+argument in Z-MERT’s config file, you’ll find it contains the command one would
+use to run the decoder. Z-MERT launches the commandFile as an external
+process, and assumes that it will launch the decoder to produce translations.
+(Make sure that commandFile is executable.) After launching this external
+process, Z-MERT waits for it to finish, then uses the resulting output file for
+parameter tuning (in addition to the output files from previous iterations).
+The command file here only has a single command, but your command file could
+have multiple lines. Just make sure the command file itself is executable.</p>
+
+<p>Notice that the Z-MERT arguments <code class="highlighter-rouge">decConfigFile</code> and <code class="highlighter-rouge">decoderOutFile</code> (<code class="highlighter-rouge">-dcfg</code> and
+<code class="highlighter-rouge">-decOut</code>) must match the two Joshua arguments in the commandFile’s (<code class="highlighter-rouge">-cmd</code>) single
+command. Also, the Z-MERT argument for N must match the value for <code class="highlighter-rouge">top_n</code> in
+Joshua’s config file, indicated by the Z-MERT argument decConfigFile (<code class="highlighter-rouge">-dcfg</code>).</p>
+
+<p>For more details on Z-MERT, refer to <code class="highlighter-rouge">$JOSHUA/examples/ZMERT/README_ZMERT.txt</code>.</p>
+
+
+ </div>
+ </div>
+
+ </body>
+</html>
+
+
+
+
+
http://git-wip-us.apache.org/repos/asf/incubator-joshua-site/blob/53cc3005/4.0/zmert.md
----------------------------------------------------------------------
diff --git a/4.0/zmert.md b/4.0/zmert.md
deleted file mode 100644
index 538a2ac..0000000
--- a/4.0/zmert.md
+++ /dev/null
@@ -1,83 +0,0 @@
----
-layout: default4
-category: advanced
-title: Z-MERT
----
-
-This document describes how to manually run the ZMERT module. ZMERT is Joshua's minimum error-rate
-training module, written by Omar F. Zaidan. It is easily adapted to drop in different decoders, and
-was also written so as to work with different objective functions (other than BLEU).
-
-((Section (1) in `$JOSHUA/examples/ZMERT/README_ZMERT.txt` is an expanded version of this section))
-
-Z-MERT, can be used by launching the driver program (`ZMERT.java`), which expects a config file as
-its main argument. This config file can be used to specify any subset of Z-MERT's 20-some
-parameters. For a full list of those parameters, and their default values, run ZMERT with a single
--h argument as follows:
-
- java -cp $JOSHUA/bin joshua.zmert.ZMERT -h
-
-So what does a Z-MERT config file look like?
-
-Examine the file `examples/ZMERT/ZMERT_config_ex2.txt`. You will find that it
-specifies the following "main" MERT parameters:
-
- (*) -dir dirPrefix: working directory
- (*) -s sourceFile: source sentences (foreign sentences) of the MERT dataset
- (*) -r refFile: target sentences (reference translations) of the MERT dataset
- (*) -rps refsPerSen: number of reference translations per sentence
- (*) -p paramsFile: file containing parameter names, initial values, and ranges
- (*) -maxIt maxMERTIts: maximum number of MERT iterations
- (*) -ipi initsPerIt: number of intermediate initial points per iteration
- (*) -cmd commandFile: name of file containing commands to run the decoder
- (*) -decOut decoderOutFile: name of the output file produced by the decoder
- (*) -dcfg decConfigFile: name of decoder config file
- (*) -N N: size of N-best list (per sentence) generated in each MERT iteration
- (*) -v verbosity: output verbosity level (0-2; higher value => more verbose)
- (*) -seed seed: seed used to initialize the random number generator
-
-(Note that the `-s` parameter is only used if Z-MERT is running Joshua as an
- internal decoder. If Joshua is run as an external decoder, as is the case in
- this README, then this parameter is ignored.)
-
-To test Z-MERT on the 100-sentence test set of example2, provide this config
-file to Z-MERT as follows:
-
- java -cp bin joshua.zmert.ZMERT -maxMem 500 examples/ZMERT/ZMERT_config_ex2.txt > examples/ZMERT/ZMERT_example/ZMERT.out
-
-This will run Z-MERT for a couple of iterations on the data from the example2
-folder. (Notice that we have made copies of the source and reference files
-from example2 and renamed them as src.txt and ref.* in the MERT_example folder,
-just to have all the files needed by Z-MERT in one place.) Once the Z-MERT run
-is complete, you should be able to inspect the log file to see what kinds of
-things it did. If everything goes well, the run should take a few minutes, of
-which more than 95% is time spent by Z-MERT waiting on Joshua to finish
-decoding the sentences (once per iteration).
-
-The output file you get should be equivalent to `ZMERT.out.verbosity1`. If you
-rerun the experiment with the verbosity (-v) argument set to 2 instead of 1,
-the output file you get should be equivalent to `ZMERT.out.verbosity2`, which has
-more interesting details about what Z-MERT does.
-
-Notice the additional `-maxMem` argument. It tells Z-MERT that it should not
-persist to use up memory while the decoder is running (during which time Z-MERT
-would be idle). The 500 tells Z-MERT that it can only use a maximum of 500 MB.
-For more details on this issue, see section (4) in Z-MERT's README.
-
-A quick note about Z-MERT's interaction with the decoder. If you examine the
-file `decoder_command_ex2.txt`, which is provided as the commandFile (`-cmd`)
-argument in Z-MERT's config file, you'll find it contains the command one would
-use to run the decoder. Z-MERT launches the commandFile as an external
-process, and assumes that it will launch the decoder to produce translations.
-(Make sure that commandFile is executable.) After launching this external
-process, Z-MERT waits for it to finish, then uses the resulting output file for
-parameter tuning (in addition to the output files from previous iterations).
-The command file here only has a single command, but your command file could
-have multiple lines. Just make sure the command file itself is executable.
-
-Notice that the Z-MERT arguments `configFile` and `decoderOutFile` (`-cfg` and
-`-decOut`) must match the two Joshua arguments in the commandFile's (`-cmd`) single
-command. Also, the Z-MERT argument for N must match the value for `top_n` in
-Joshua's config file, indicated by the Z-MERT argument configFile (`-cfg`).
-
-For more details on Z-MERT, refer to `$JOSHUA/examples/ZMERT/README_ZMERT.txt`
http://git-wip-us.apache.org/repos/asf/incubator-joshua-site/blob/53cc3005/5.0/advanced.html
----------------------------------------------------------------------
diff --git a/5.0/advanced.html b/5.0/advanced.html
new file mode 100644
index 0000000..ad963e7
--- /dev/null
+++ b/5.0/advanced.html
@@ -0,0 +1,170 @@
+<!DOCTYPE html>
+<html lang="en">
+ <head>
+ <meta charset="utf-8">
+ <title>Joshua Documentation | Advanced features</title>
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
+ <meta name="description" content="">
+ <meta name="author" content="">
+
+ <!-- Le styles -->
+ <link href="/bootstrap/css/bootstrap.css" rel="stylesheet">
+ <style>
+ body {
+ padding-top: 60px; /* 60px to make the container go all the way to the bottom of the topbar */
+ }
+ #download {
+ background-color: green;
+ font-size: 14pt;
+ font-weight: bold;
+ text-align: center;
+ color: white;
+ border-radius: 5px;
+ padding: 4px;
+ }
+
+ #download a:link {
+ color: white;
+ }
+
+ #download a:hover {
+ color: lightgrey;
+ }
+
+ #download a:visited {
+ color: white;
+ }
+
+ a.pdf {
+ font-variant: small-caps;
+ /* font-weight: bold; */
+ font-size: 10pt;
+ color: white;
+ background: brown;
+ padding: 2px;
+ }
+
+ a.bibtex {
+ font-variant: small-caps;
+ /* font-weight: bold; */
+ font-size: 10pt;
+ color: white;
+ background: orange;
+ padding: 2px;
+ }
+
+ img.sponsor {
+ height: 120px;
+ margin: 5px;
+ }
+ </style>
+ <link href="bootstrap/css/bootstrap-responsive.css" rel="stylesheet">
+
+ <!-- HTML5 shim, for IE6-8 support of HTML5 elements -->
+ <!--[if lt IE 9]>
+ <script src="bootstrap/js/html5shiv.js"></script>
+ <![endif]-->
+
+ <!-- Fav and touch icons -->
+ <link rel="apple-touch-icon-precomposed" sizes="144x144" href="bootstrap/ico/apple-touch-icon-144-precomposed.png">
+ <link rel="apple-touch-icon-precomposed" sizes="114x114" href="bootstrap/ico/apple-touch-icon-114-precomposed.png">
+ <link rel="apple-touch-icon-precomposed" sizes="72x72" href="bootstrap/ico/apple-touch-icon-72-precomposed.png">
+ <link rel="apple-touch-icon-precomposed" href="bootstrap/ico/apple-touch-icon-57-precomposed.png">
+ <link rel="shortcut icon" href="bootstrap/ico/favicon.png">
+ </head>
+
+ <body>
+
+ <div class="navbar navbar-inverse navbar-fixed-top">
+ <div class="navbar-inner">
+ <div class="container">
+ <button type="button" class="btn btn-navbar" data-toggle="collapse" data-target=".nav-collapse">
+ <span class="icon-bar"></span>
+ <span class="icon-bar"></span>
+ <span class="icon-bar"></span>
+ </button>
+ <a class="brand" href="/">Joshua</a>
+ <div class="nav-collapse collapse">
+ <ul class="nav">
+ <li><a href="index.html">Documentation</a></li>
+ <li><a href="pipeline.html">Pipeline</a></li>
+ <li><a href="tutorial.html">Tutorial</a></li>
+ <li><a href="decoder.html">Decoder</a></li>
+ <li><a href="thrax.html">Thrax</a></li>
+ <li><a href="file-formats.html">File formats</a></li>
+ <!-- <li><a href="advanced.html">Advanced</a></li> -->
+ <li><a href="faq.html">FAQ</a></li>
+ </ul>
+ </div><!--/.nav-collapse -->
+ </div>
+ </div>
+ </div>
+
+ <div class="container">
+
+ <div class="row">
+ <div class="span2">
+ <img src="/images/joshua-logo-small.png"
+ alt="Joshua logo (picture of a Joshua tree)" />
+ </div>
+ <div class="span10">
+ <h1>Joshua Documentation</h1>
+ <h2>Advanced features</h2>
+ <span id="download">
+ <a href="http://cs.jhu.edu/~post/files/joshua-v5.0.tgz">Download</a>
+ </span>
+ (version 5.0, released 16 August 2013)
+ </div>
+ </div>
+
+ <hr />
+
+ <div class="row">
+ <div class="span8">
+
+
+
+
+ </div>
+ </div>
+ </div> <!-- /container -->
+
+ <!-- Le javascript
+ ================================================== -->
+ <!-- Placed at the end of the document so the pages load faster -->
+ <script src="bootstrap/js/jquery.js"></script>
+ <script src="bootstrap/js/bootstrap-transition.js"></script>
+ <script src="bootstrap/js/bootstrap-alert.js"></script>
+ <script src="bootstrap/js/bootstrap-modal.js"></script>
+ <script src="bootstrap/js/bootstrap-dropdown.js"></script>
+ <script src="bootstrap/js/bootstrap-scrollspy.js"></script>
+ <script src="bootstrap/js/bootstrap-tab.js"></script>
+ <script src="bootstrap/js/bootstrap-tooltip.js"></script>
+ <script src="bootstrap/js/bootstrap-popover.js"></script>
+ <script src="bootstrap/js/bootstrap-button.js"></script>
+ <script src="bootstrap/js/bootstrap-collapse.js"></script>
+ <script src="bootstrap/js/bootstrap-carousel.js"></script>
+ <script src="bootstrap/js/bootstrap-typeahead.js"></script>
+
+ <!-- Start of StatCounter Code for Default Guide -->
+ <script type="text/javascript">
+ var sc_project=8264132;
+ var sc_invisible=1;
+ var sc_security="4b97fe2d";
+ </script>
+ <script type="text/javascript" src="http://www.statcounter.com/counter/counter.js"></script>
+ <noscript>
+ <div class="statcounter">
+ <a title="hit counter joomla"
+ href="http://statcounter.com/joomla/"
+ target="_blank">
+ <img class="statcounter"
+ src="http://c.statcounter.com/8264132/0/4b97fe2d/1/"
+ alt="hit counter joomla" />
+ </a>
+ </div>
+ </noscript>
+ <!-- End of StatCounter Code for Default Guide -->
+
+ </body>
+</html>
http://git-wip-us.apache.org/repos/asf/incubator-joshua-site/blob/53cc3005/5.0/advanced.md
----------------------------------------------------------------------
diff --git a/5.0/advanced.md b/5.0/advanced.md
deleted file mode 100644
index 174041e..0000000
--- a/5.0/advanced.md
+++ /dev/null
@@ -1,7 +0,0 @@
----
-layout: default
-category: links
-title: Advanced features
----
-
-
http://git-wip-us.apache.org/repos/asf/incubator-joshua-site/blob/53cc3005/5.0/bundle.html
----------------------------------------------------------------------
diff --git a/5.0/bundle.html b/5.0/bundle.html
new file mode 100644
index 0000000..4977a8f
--- /dev/null
+++ b/5.0/bundle.html
@@ -0,0 +1,189 @@
+<!DOCTYPE html>
+<html lang="en">
+ <head>
+ <meta charset="utf-8">
+ <title>Joshua Documentation | Bundling a configuration</title>
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
+ <meta name="description" content="">
+ <meta name="author" content="">
+
+ <!-- Le styles -->
+ <link href="/bootstrap/css/bootstrap.css" rel="stylesheet">
+ <style>
+ body {
+ padding-top: 60px; /* 60px to make the container go all the way to the bottom of the topbar */
+ }
+ #download {
+ background-color: green;
+ font-size: 14pt;
+ font-weight: bold;
+ text-align: center;
+ color: white;
+ border-radius: 5px;
+ padding: 4px;
+ }
+
+ #download a:link {
+ color: white;
+ }
+
+ #download a:hover {
+ color: lightgrey;
+ }
+
+ #download a:visited {
+ color: white;
+ }
+
+ a.pdf {
+ font-variant: small-caps;
+ /* font-weight: bold; */
+ font-size: 10pt;
+ color: white;
+ background: brown;
+ padding: 2px;
+ }
+
+ a.bibtex {
+ font-variant: small-caps;
+ /* font-weight: bold; */
+ font-size: 10pt;
+ color: white;
+ background: orange;
+ padding: 2px;
+ }
+
+ img.sponsor {
+ height: 120px;
+ margin: 5px;
+ }
+ </style>
+ <link href="bootstrap/css/bootstrap-responsive.css" rel="stylesheet">
+
+ <!-- HTML5 shim, for IE6-8 support of HTML5 elements -->
+ <!--[if lt IE 9]>
+ <script src="bootstrap/js/html5shiv.js"></script>
+ <![endif]-->
+
+ <!-- Fav and touch icons -->
+ <link rel="apple-touch-icon-precomposed" sizes="144x144" href="bootstrap/ico/apple-touch-icon-144-precomposed.png">
+ <link rel="apple-touch-icon-precomposed" sizes="114x114" href="bootstrap/ico/apple-touch-icon-114-precomposed.png">
+ <link rel="apple-touch-icon-precomposed" sizes="72x72" href="bootstrap/ico/apple-touch-icon-72-precomposed.png">
+ <link rel="apple-touch-icon-precomposed" href="bootstrap/ico/apple-touch-icon-57-precomposed.png">
+ <link rel="shortcut icon" href="bootstrap/ico/favicon.png">
+ </head>
+
+ <body>
+
+ <div class="navbar navbar-inverse navbar-fixed-top">
+ <div class="navbar-inner">
+ <div class="container">
+ <button type="button" class="btn btn-navbar" data-toggle="collapse" data-target=".nav-collapse">
+ <span class="icon-bar"></span>
+ <span class="icon-bar"></span>
+ <span class="icon-bar"></span>
+ </button>
+ <a class="brand" href="/">Joshua</a>
+ <div class="nav-collapse collapse">
+ <ul class="nav">
+ <li><a href="index.html">Documentation</a></li>
+ <li><a href="pipeline.html">Pipeline</a></li>
+ <li><a href="tutorial.html">Tutorial</a></li>
+ <li><a href="decoder.html">Decoder</a></li>
+ <li><a href="thrax.html">Thrax</a></li>
+ <li><a href="file-formats.html">File formats</a></li>
+ <!-- <li><a href="advanced.html">Advanced</a></li> -->
+ <li><a href="faq.html">FAQ</a></li>
+ </ul>
+ </div><!--/.nav-collapse -->
+ </div>
+ </div>
+ </div>
+
+ <div class="container">
+
+ <div class="row">
+ <div class="span2">
+ <img src="/images/joshua-logo-small.png"
+ alt="Joshua logo (picture of a Joshua tree)" />
+ </div>
+ <div class="span10">
+ <h1>Joshua Documentation</h1>
+ <h2>Bundling a configuration</h2>
+ <span id="download">
+ <a href="http://cs.jhu.edu/~post/files/joshua-v5.0.tgz">Download</a>
+ </span>
+ (version 5.0, released 16 August 2013)
+ </div>
+ </div>
+
+ <hr />
+
+ <div class="row">
+ <div class="span8">
+
+ <p>A <em>bundled configuration</em> is a minimal set of configuration, resource, and script files. A script, <code class="highlighter-rouge">$JOSHUA/scripts/support/run-bundler.py</code> can be used to package up the run bundle. The resulting bundle can easily be transferred and shared.</p>
+
+<p><strong>Example invocation:</strong></p>
+
+<div class="highlighter-rouge"><pre class="highlight"><code>./run-bundler.py \
+ --force \
+ /path/to/rundir/runs/5/test/1/joshua.config \
+ /path/to/rundir/runs/5 \
+ bundled-configurations \
+ "-top-n 1 \
+ -output-format %S \
+ -mark-oovs false \
+ -server-port 5674 \
+ -tm/pt 'thrax pt 20 /path/to/rundir/runs/5/test/1/grammar.gz'"
+</code></pre>
+</div>
+
+<p>A new directory <code class="highlighter-rouge">./bundled-configurations</code> will be created, and all the bundled files will be copied or created in it. To use the configuration with Joshua, run the executable file <code class="highlighter-rouge">./bundled-configurations/bundle-runner.sh</code>.</p>
+
+<p>Note, the additional options between the pair of quotation marks are passed as arguments to the <code class="highlighter-rouge">$JOSHUA/scripts/copy-config.pl</code> script. That script has some special parameters, especially the <code class="highlighter-rouge">-tm/..</code> option.</p>
+
+
+ </div>
+ </div>
+ </div> <!-- /container -->
+
+ <!-- Le javascript
+ ================================================== -->
+ <!-- Placed at the end of the document so the pages load faster -->
+ <script src="bootstrap/js/jquery.js"></script>
+ <script src="bootstrap/js/bootstrap-transition.js"></script>
+ <script src="bootstrap/js/bootstrap-alert.js"></script>
+ <script src="bootstrap/js/bootstrap-modal.js"></script>
+ <script src="bootstrap/js/bootstrap-dropdown.js"></script>
+ <script src="bootstrap/js/bootstrap-scrollspy.js"></script>
+ <script src="bootstrap/js/bootstrap-tab.js"></script>
+ <script src="bootstrap/js/bootstrap-tooltip.js"></script>
+ <script src="bootstrap/js/bootstrap-popover.js"></script>
+ <script src="bootstrap/js/bootstrap-button.js"></script>
+ <script src="bootstrap/js/bootstrap-collapse.js"></script>
+ <script src="bootstrap/js/bootstrap-carousel.js"></script>
+ <script src="bootstrap/js/bootstrap-typeahead.js"></script>
+
+ <!-- Start of StatCounter Code for Default Guide -->
+ <script type="text/javascript">
+ var sc_project=8264132;
+ var sc_invisible=1;
+ var sc_security="4b97fe2d";
+ </script>
+ <script type="text/javascript" src="http://www.statcounter.com/counter/counter.js"></script>
+ <noscript>
+ <div class="statcounter">
+ <a title="hit counter joomla"
+ href="http://statcounter.com/joomla/"
+ target="_blank">
+ <img class="statcounter"
+ src="http://c.statcounter.com/8264132/0/4b97fe2d/1/"
+ alt="hit counter joomla" />
+ </a>
+ </div>
+ </noscript>
+ <!-- End of StatCounter Code for Default Guide -->
+
+ </body>
+</html>
http://git-wip-us.apache.org/repos/asf/incubator-joshua-site/blob/53cc3005/5.0/bundle.md
----------------------------------------------------------------------
diff --git a/5.0/bundle.md b/5.0/bundle.md
deleted file mode 100644
index c3874ab..0000000
--- a/5.0/bundle.md
+++ /dev/null
@@ -1,24 +0,0 @@
----
-layout: default
-category: links
-title: Bundling a configuration
----
-
-A *bundled configuration* is a minimal set of configuration, resource, and script files. A script, `$JOSHUA/scripts/support/run-bundler.py` can be used to package up the run bundle. The resulting bundle can easily be transferred and shared.
-
-**Example invocation:**
-
- ./run-bundler.py \
- --force \
- /path/to/rundir/runs/5/test/1/joshua.config \
- /path/to/rundir/runs/5 \
- bundled-configurations \
- "-top-n 1 \
- -output-format %S \
- -mark-oovs false \
- -server-port 5674 \
- -tm/pt "thrax pt 20 /path/to/rundir/runs/5/test/1/grammar.gz"
-
-A new directory `./bundled-configurations` will be created, and all the bundled files will be copied or created in it. To use the configuration with Joshua, run the executable file `./bundled-configurations/bundle-runner.sh`.
-
-Note, the additional options between the pair of quotation marks are passed as arguments to the `$JOSHUA/scripts/copy-config.pl` script. That script has some special parameters, especially the `-tm/..` option.