You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/04/19 21:34:05 UTC
[17/51] [partial] incubator-joshua git commit: Converted KenLM into a
submodule
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/jam-files/engine/regexp.c
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/jam-files/engine/regexp.c b/ext/kenlm/jam-files/engine/regexp.c
deleted file mode 100644
index c64201b..0000000
--- a/ext/kenlm/jam-files/engine/regexp.c
+++ /dev/null
@@ -1,1329 +0,0 @@
-/*
- * regcomp and regexec -- regsub and regerror are elsewhere
- *
- * Copyright (c) 1986 by University of Toronto.
- * Written by Henry Spencer. Not derived from licensed software.
- *
- * Permission is granted to anyone to use this software for any
- * purpose on any computer system, and to redistribute it freely,
- * subject to the following restrictions:
- *
- * 1. The author is not responsible for the consequences of use of
- * this software, no matter how awful, even if they arise
- * from defects in it.
- *
- * 2. The origin of this software must not be misrepresented, either
- * by explicit claim or by omission.
- *
- * 3. Altered versions must be plainly marked as such, and must not
- * be misrepresented as being the original software.
- *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
- *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to |
- *** to assist in implementing egrep.
- *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
- *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching
- *** as in BSD grep and ex.
- *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
- *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \.
- *** THIS IS AN ALTERED VERSION. It was altered by James A. Woods,
- *** ames!jaw, on 19 June 1987, to quash a regcomp() redundancy.
- *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
- *** seiwald@vix.com, on 28 August 1993, for use in jam. Regmagic.h
- *** was moved into regexp.h, and the include of regexp.h now uses "'s
- *** to avoid conflicting with the system regexp.h. Const, bless its
- *** soul, was removed so it can compile everywhere. The declaration
- *** of strchr() was in conflict on AIX, so it was removed (as it is
- *** happily defined in string.h).
- *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
- *** seiwald@perforce.com, on 20 January 2000, to use function prototypes.
- *
- * Beware that some of this code is subtly aware of the way operator precedence
- * is structured in regular expressions. Serious changes in regular-expression
- * syntax might require a total rethink.
- */
-
-
-#include "jam.h"
-#include "regexp.h"
-
-#include <stdio.h>
-#include <ctype.h>
-#ifndef ultrix
-# include <stdlib.h>
-#endif
-#include <string.h>
-
-
-/*
- * The "internal use only" fields in regexp.h are present to pass info from
- * compile to execute that permits the execute phase to run lots faster on
- * simple cases. They are:
- :
- * regstart char that must begin a match; '\0' if none obvious.
- * reganch is the match anchored (at beginning-of-line only)?
- * regmust string (pointer into program) that match must include, or NULL.
- * regmlen length of regmust string.
- *
- * Regstart and reganch permit very fast decisions on suitable starting points
- * for a match, cutting down the work a lot. Regmust permits fast rejection of
- * lines that cannot possibly match. The regmust tests are costly enough that
- * regcomp() supplies a regmust only if the r.e. contains something potentially
- * expensive (at present, the only such thing detected is * or + at the start of
- * the r.e., which can involve a lot of backup). Regmlen is supplied because the
- * test in regexec() needs it and regcomp() is computing it anyway.
- */
-
-/*
- * Structure for regexp "program". This is essentially a linear encoding of a
- * nondeterministic finite-state machine (aka syntax charts or "railroad normal
- * form" in parsing technology). Each node is an opcode plus a "next" pointer,
- * possibly plus an operand. "Next" pointers of all nodes except BRANCH
- * implement concatenation; a "next" pointer with a BRANCH on both ends of it is
- * connecting two alternatives. [Here we have one of the subtle syntax
- * dependencies: an individual BRANCH, as opposed to a collection of them, is
- * never concatenated with anything because of operator precedence.] The operand
- * of some types of node is a literal string; for others, it is a node leading
- * into a sub-FSM. In particular, the operand of a BRANCH node is the first node
- * of the branch. [NB this is *not* a tree structure: the tail of the branch
- * connects to the thing following the set of BRANCHes.] The opcodes are:
- */
-
-/* definition number opnd? meaning */
-#define END 0 /* no End of program. */
-#define BOL 1 /* no Match "" at beginning of line. */
-#define EOL 2 /* no Match "" at end of line. */
-#define ANY 3 /* no Match any one character. */
-#define ANYOF 4 /* str Match any character in this string. */
-#define ANYBUT 5 /* str Match any character not in this string. */
-#define BRANCH 6 /* node Match this alternative, or the next... */
-#define BACK 7 /* no Match "", "next" ptr points backward. */
-#define EXACTLY 8 /* str Match this string. */
-#define NOTHING 9 /* no Match empty string. */
-#define STAR 10 /* node Match this (simple) thing 0 or more times. */
-#define PLUS 11 /* node Match this (simple) thing 1 or more times. */
-#define WORDA 12 /* no Match "" at wordchar, where prev is nonword */
-#define WORDZ 13 /* no Match "" at nonwordchar, where prev is word */
-#define OPEN 20 /* no Mark this point in input as start of #n. */
- /* OPEN+1 is number 1, etc. */
-#define CLOSE 30 /* no Analogous to OPEN. */
-
-
-/*
- * Opcode notes:
- *
- * BRANCH The set of branches constituting a single choice are hooked
- * together with their "next" pointers, since precedence prevents
- * anything being concatenated to any individual branch. The
- * "next" pointer of the last BRANCH in a choice points to the
- * thing following the whole choice. This is also where the
- * final "next" pointer of each individual branch points; each
- * branch starts with the operand node of a BRANCH node.
- *
- * BACK Normal "next" pointers all implicitly point forward; BACK
- * exists to make loop structures possible.
- *
- * STAR,PLUS '?', and complex '*' and '+', are implemented as circular
- * BRANCH structures using BACK. Simple cases (one character
- * per match) are implemented with STAR and PLUS for speed
- * and to minimize recursive plunges.
- *
- * OPEN,CLOSE ...are numbered at compile time.
- */
-
-/*
- * A node is one char of opcode followed by two chars of "next" pointer.
- * "Next" pointers are stored as two 8-bit pieces, high order first. The
- * value is a positive offset from the opcode of the node containing it.
- * An operand, if any, simply follows the node. (Note that much of the
- * code generation knows about this implicit relationship.)
- *
- * Using two bytes for the "next" pointer is vast overkill for most things,
- * but allows patterns to get big without disasters.
- */
-#define OP(p) (*(p))
-#define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
-#define OPERAND(p) ((p) + 3)
-
-/*
- * See regmagic.h for one further detail of program structure.
- */
-
-
-/*
- * Utility definitions.
- */
-#ifndef CHARBITS
-#define UCHARAT(p) ((int)*(const unsigned char *)(p))
-#else
-#define UCHARAT(p) ((int)*(p)&CHARBITS)
-#endif
-
-#define FAIL(m) { regerror(m); return(NULL); }
-#define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?')
-
-/*
- * Flags to be passed up and down.
- */
-#define HASWIDTH 01 /* Known never to match null string. */
-#define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */
-#define SPSTART 04 /* Starts with * or +. */
-#define WORST 0 /* Worst case. */
-
-/*
- * Global work variables for regcomp().
- */
-static char *regparse; /* Input-scan pointer. */
-static int regnpar; /* () count. */
-static char regdummy;
-static char *regcode; /* Code-emit pointer; &regdummy = don't. */
-static long regsize; /* Code size. */
-
-/*
- * Forward declarations for regcomp()'s friends.
- */
-#ifndef STATIC
-#define STATIC static
-#endif
-STATIC char *reg( int paren, int *flagp );
-STATIC char *regbranch( int *flagp );
-STATIC char *regpiece( int *flagp );
-STATIC char *regatom( int *flagp );
-STATIC char *regnode( int op );
-STATIC char *regnext( register char *p );
-STATIC void regc( int b );
-STATIC void reginsert( char op, char *opnd );
-STATIC void regtail( char *p, char *val );
-STATIC void regoptail( char *p, char *val );
-#ifdef STRCSPN
-STATIC int strcspn();
-#endif
-
-/*
- - regcomp - compile a regular expression into internal code
- *
- * We can't allocate space until we know how big the compiled form will be,
- * but we can't compile it (and thus know how big it is) until we've got a
- * place to put the code. So we cheat: we compile it twice, once with code
- * generation turned off and size counting turned on, and once "for real".
- * This also means that we don't allocate space until we are sure that the
- * thing really will compile successfully, and we never have to move the
- * code and thus invalidate pointers into it. (Note that it has to be in
- * one piece because free() must be able to free it all.)
- *
- * Beware that the optimization-preparation code in here knows about some
- * of the structure of the compiled regexp.
- */
-regexp *
-regcomp( const char *exp )
-{
- register regexp *r;
- register char *scan;
- register char *longest;
- register unsigned len;
- int flags;
-
- if (exp == NULL)
- FAIL("NULL argument");
-
- /* First pass: determine size, legality. */
-#ifdef notdef
- if (exp[0] == '.' && exp[1] == '*') exp += 2; /* aid grep */
-#endif
- regparse = (char *)exp;
- regnpar = 1;
- regsize = 0L;
- regcode = &regdummy;
- regc(MAGIC);
- if (reg(0, &flags) == NULL)
- return(NULL);
-
- /* Small enough for pointer-storage convention? */
- if (regsize >= 32767L) /* Probably could be 65535L. */
- FAIL("regexp too big");
-
- /* Allocate space. */
- r = (regexp *)BJAM_MALLOC(sizeof(regexp) + (unsigned)regsize);
- if (r == NULL)
- FAIL("out of space");
-
- /* Second pass: emit code. */
- regparse = (char *)exp;
- regnpar = 1;
- regcode = r->program;
- regc(MAGIC);
- if (reg(0, &flags) == NULL)
- return(NULL);
-
- /* Dig out information for optimizations. */
- r->regstart = '\0'; /* Worst-case defaults. */
- r->reganch = 0;
- r->regmust = NULL;
- r->regmlen = 0;
- scan = r->program+1; /* First BRANCH. */
- if (OP(regnext(scan)) == END) { /* Only one top-level choice. */
- scan = OPERAND(scan);
-
- /* Starting-point info. */
- if (OP(scan) == EXACTLY)
- r->regstart = *OPERAND(scan);
- else if (OP(scan) == BOL)
- r->reganch++;
-
- /*
- * If there's something expensive in the r.e., find the
- * longest literal string that must appear and make it the
- * regmust. Resolve ties in favor of later strings, since
- * the regstart check works with the beginning of the r.e.
- * and avoiding duplication strengthens checking. Not a
- * strong reason, but sufficient in the absence of others.
- */
- if (flags&SPSTART) {
- longest = NULL;
- len = 0;
- for (; scan != NULL; scan = regnext(scan))
- if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) {
- longest = OPERAND(scan);
- len = strlen(OPERAND(scan));
- }
- r->regmust = longest;
- r->regmlen = len;
- }
- }
-
- return(r);
-}
-
-/*
- - reg - regular expression, i.e. main body or parenthesized thing
- *
- * Caller must absorb opening parenthesis.
- *
- * Combining parenthesis handling with the base level of regular expression
- * is a trifle forced, but the need to tie the tails of the branches to what
- * follows makes it hard to avoid.
- */
-static char *
-reg(
- int paren, /* Parenthesized? */
- int *flagp )
-{
- register char *ret;
- register char *br;
- register char *ender;
- register int parno = 0;
- int flags;
-
- *flagp = HASWIDTH; /* Tentatively. */
-
- /* Make an OPEN node, if parenthesized. */
- if (paren) {
- if (regnpar >= NSUBEXP)
- FAIL("too many ()");
- parno = regnpar;
- regnpar++;
- ret = regnode(OPEN+parno);
- } else
- ret = NULL;
-
- /* Pick up the branches, linking them together. */
- br = regbranch(&flags);
- if (br == NULL)
- return(NULL);
- if (ret != NULL)
- regtail(ret, br); /* OPEN -> first. */
- else
- ret = br;
- if (!(flags&HASWIDTH))
- *flagp &= ~HASWIDTH;
- *flagp |= flags&SPSTART;
- while (*regparse == '|' || *regparse == '\n') {
- regparse++;
- br = regbranch(&flags);
- if (br == NULL)
- return(NULL);
- regtail(ret, br); /* BRANCH -> BRANCH. */
- if (!(flags&HASWIDTH))
- *flagp &= ~HASWIDTH;
- *flagp |= flags&SPSTART;
- }
-
- /* Make a closing node, and hook it on the end. */
- ender = regnode((paren) ? CLOSE+parno : END);
- regtail(ret, ender);
-
- /* Hook the tails of the branches to the closing node. */
- for (br = ret; br != NULL; br = regnext(br))
- regoptail(br, ender);
-
- /* Check for proper termination. */
- if (paren && *regparse++ != ')') {
- FAIL("unmatched ()");
- } else if (!paren && *regparse != '\0') {
- if (*regparse == ')') {
- FAIL("unmatched ()");
- } else
- FAIL("junk on end"); /* "Can't happen". */
- /* NOTREACHED */
- }
-
- return(ret);
-}
-
-/*
- - regbranch - one alternative of an | operator
- *
- * Implements the concatenation operator.
- */
-static char *
-regbranch( int *flagp )
-{
- register char *ret;
- register char *chain;
- register char *latest;
- int flags;
-
- *flagp = WORST; /* Tentatively. */
-
- ret = regnode(BRANCH);
- chain = NULL;
- while (*regparse != '\0' && *regparse != ')' &&
- *regparse != '\n' && *regparse != '|') {
- latest = regpiece(&flags);
- if (latest == NULL)
- return(NULL);
- *flagp |= flags&HASWIDTH;
- if (chain == NULL) /* First piece. */
- *flagp |= flags&SPSTART;
- else
- regtail(chain, latest);
- chain = latest;
- }
- if (chain == NULL) /* Loop ran zero times. */
- (void) regnode(NOTHING);
-
- return(ret);
-}
-
-/*
- - regpiece - something followed by possible [*+?]
- *
- * Note that the branching code sequences used for ? and the general cases
- * of * and + are somewhat optimized: they use the same NOTHING node as
- * both the endmarker for their branch list and the body of the last branch.
- * It might seem that this node could be dispensed with entirely, but the
- * endmarker role is not redundant.
- */
-static char *
-regpiece( int *flagp )
-{
- register char *ret;
- register char op;
- register char *next;
- int flags;
-
- ret = regatom(&flags);
- if (ret == NULL)
- return(NULL);
-
- op = *regparse;
- if (!ISMULT(op)) {
- *flagp = flags;
- return(ret);
- }
-
- if (!(flags&HASWIDTH) && op != '?')
- FAIL("*+ operand could be empty");
- *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
-
- if (op == '*' && (flags&SIMPLE))
- reginsert(STAR, ret);
- else if (op == '*') {
- /* Emit x* as (x&|), where & means "self". */
- reginsert(BRANCH, ret); /* Either x */
- regoptail(ret, regnode(BACK)); /* and loop */
- regoptail(ret, ret); /* back */
- regtail(ret, regnode(BRANCH)); /* or */
- regtail(ret, regnode(NOTHING)); /* null. */
- } else if (op == '+' && (flags&SIMPLE))
- reginsert(PLUS, ret);
- else if (op == '+') {
- /* Emit x+ as x(&|), where & means "self". */
- next = regnode(BRANCH); /* Either */
- regtail(ret, next);
- regtail(regnode(BACK), ret); /* loop back */
- regtail(next, regnode(BRANCH)); /* or */
- regtail(ret, regnode(NOTHING)); /* null. */
- } else if (op == '?') {
- /* Emit x? as (x|) */
- reginsert(BRANCH, ret); /* Either x */
- regtail(ret, regnode(BRANCH)); /* or */
- next = regnode(NOTHING); /* null. */
- regtail(ret, next);
- regoptail(ret, next);
- }
- regparse++;
- if (ISMULT(*regparse))
- FAIL("nested *?+");
-
- return(ret);
-}
-
-/*
- - regatom - the lowest level
- *
- * Optimization: gobbles an entire sequence of ordinary characters so that
- * it can turn them into a single node, which is smaller to store and
- * faster to run. Backslashed characters are exceptions, each becoming a
- * separate node; the code is simpler that way and it's not worth fixing.
- */
-static char *
-regatom( int *flagp )
-{
- register char *ret;
- int flags;
-
- *flagp = WORST; /* Tentatively. */
-
- switch (*regparse++) {
- /* FIXME: these chars only have meaning at beg/end of pat? */
- case '^':
- ret = regnode(BOL);
- break;
- case '$':
- ret = regnode(EOL);
- break;
- case '.':
- ret = regnode(ANY);
- *flagp |= HASWIDTH|SIMPLE;
- break;
- case '[': {
- register int classr;
- register int classend;
-
- if (*regparse == '^') { /* Complement of range. */
- ret = regnode(ANYBUT);
- regparse++;
- } else
- ret = regnode(ANYOF);
- if (*regparse == ']' || *regparse == '-')
- regc(*regparse++);
- while (*regparse != '\0' && *regparse != ']') {
- if (*regparse == '-') {
- regparse++;
- if (*regparse == ']' || *regparse == '\0')
- regc('-');
- else {
- classr = UCHARAT(regparse-2)+1;
- classend = UCHARAT(regparse);
- if (classr > classend+1)
- FAIL("invalid [] range");
- for (; classr <= classend; classr++)
- regc(classr);
- regparse++;
- }
- } else
- regc(*regparse++);
- }
- regc('\0');
- if (*regparse != ']')
- FAIL("unmatched []");
- regparse++;
- *flagp |= HASWIDTH|SIMPLE;
- }
- break;
- case '(':
- ret = reg(1, &flags);
- if (ret == NULL)
- return(NULL);
- *flagp |= flags&(HASWIDTH|SPSTART);
- break;
- case '\0':
- case '|':
- case '\n':
- case ')':
- FAIL("internal urp"); /* Supposed to be caught earlier. */
- break;
- case '?':
- case '+':
- case '*':
- FAIL("?+* follows nothing");
- break;
- case '\\':
- switch (*regparse++) {
- case '\0':
- FAIL("trailing \\");
- break;
- case '<':
- ret = regnode(WORDA);
- break;
- case '>':
- ret = regnode(WORDZ);
- break;
- /* FIXME: Someday handle \1, \2, ... */
- default:
- /* Handle general quoted chars in exact-match routine */
- goto de_fault;
- }
- break;
- de_fault:
- default:
- /*
- * Encode a string of characters to be matched exactly.
- *
- * This is a bit tricky due to quoted chars and due to
- * '*', '+', and '?' taking the SINGLE char previous
- * as their operand.
- *
- * On entry, the char at regparse[-1] is going to go
- * into the string, no matter what it is. (It could be
- * following a \ if we are entered from the '\' case.)
- *
- * Basic idea is to pick up a good char in ch and
- * examine the next char. If it's *+? then we twiddle.
- * If it's \ then we frozzle. If it's other magic char
- * we push ch and terminate the string. If none of the
- * above, we push ch on the string and go around again.
- *
- * regprev is used to remember where "the current char"
- * starts in the string, if due to a *+? we need to back
- * up and put the current char in a separate, 1-char, string.
- * When regprev is NULL, ch is the only char in the
- * string; this is used in *+? handling, and in setting
- * flags |= SIMPLE at the end.
- */
- {
- char *regprev;
- register char ch;
-
- regparse--; /* Look at cur char */
- ret = regnode(EXACTLY);
- for ( regprev = 0 ; ; ) {
- ch = *regparse++; /* Get current char */
- switch (*regparse) { /* look at next one */
-
- default:
- regc(ch); /* Add cur to string */
- break;
-
- case '.': case '[': case '(':
- case ')': case '|': case '\n':
- case '$': case '^':
- case '\0':
- /* FIXME, $ and ^ should not always be magic */
- magic:
- regc(ch); /* dump cur char */
- goto done; /* and we are done */
-
- case '?': case '+': case '*':
- if (!regprev) /* If just ch in str, */
- goto magic; /* use it */
- /* End mult-char string one early */
- regparse = regprev; /* Back up parse */
- goto done;
-
- case '\\':
- regc(ch); /* Cur char OK */
- switch (regparse[1]){ /* Look after \ */
- case '\0':
- case '<':
- case '>':
- /* FIXME: Someday handle \1, \2, ... */
- goto done; /* Not quoted */
- default:
- /* Backup point is \, scan * point is after it. */
- regprev = regparse;
- regparse++;
- continue; /* NOT break; */
- }
- }
- regprev = regparse; /* Set backup point */
- }
- done:
- regc('\0');
- *flagp |= HASWIDTH;
- if (!regprev) /* One char? */
- *flagp |= SIMPLE;
- }
- break;
- }
-
- return(ret);
-}
-
-/*
- - regnode - emit a node
- */
-static char * /* Location. */
-regnode( int op )
-{
- register char *ret;
- register char *ptr;
-
- ret = regcode;
- if (ret == &regdummy) {
- regsize += 3;
- return(ret);
- }
-
- ptr = ret;
- *ptr++ = op;
- *ptr++ = '\0'; /* Null "next" pointer. */
- *ptr++ = '\0';
- regcode = ptr;
-
- return(ret);
-}
-
-/*
- - regc - emit (if appropriate) a byte of code
- */
-static void
-regc( int b )
-{
- if (regcode != &regdummy)
- *regcode++ = b;
- else
- regsize++;
-}
-
-/*
- - reginsert - insert an operator in front of already-emitted operand
- *
- * Means relocating the operand.
- */
-static void
-reginsert(
- char op,
- char *opnd )
-{
- register char *src;
- register char *dst;
- register char *place;
-
- if (regcode == &regdummy) {
- regsize += 3;
- return;
- }
-
- src = regcode;
- regcode += 3;
- dst = regcode;
- while (src > opnd)
- *--dst = *--src;
-
- place = opnd; /* Op node, where operand used to be. */
- *place++ = op;
- *place++ = '\0';
- *place++ = '\0';
-}
-
-/*
- - regtail - set the next-pointer at the end of a node chain
- */
-static void
-regtail(
- char *p,
- char *val )
-{
- register char *scan;
- register char *temp;
- register int offset;
-
- if (p == &regdummy)
- return;
-
- /* Find last node. */
- scan = p;
- for (;;) {
- temp = regnext(scan);
- if (temp == NULL)
- break;
- scan = temp;
- }
-
- if (OP(scan) == BACK)
- offset = scan - val;
- else
- offset = val - scan;
- *(scan+1) = (offset>>8)&0377;
- *(scan+2) = offset&0377;
-}
-
-/*
- - regoptail - regtail on operand of first argument; nop if operandless
- */
-
-static void
-regoptail(
- char *p,
- char *val )
-{
- /* "Operandless" and "op != BRANCH" are synonymous in practice. */
- if (p == NULL || p == &regdummy || OP(p) != BRANCH)
- return;
- regtail(OPERAND(p), val);
-}
-
-/*
- * regexec and friends
- */
-
-/*
- * Global work variables for regexec().
- */
-static const char *reginput; /* String-input pointer. */
-static const char *regbol; /* Beginning of input, for ^ check. */
-static const char **regstartp; /* Pointer to startp array. */
-static const char **regendp; /* Ditto for endp. */
-
-/*
- * Forwards.
- */
-STATIC int regtry( regexp *prog, const char *string );
-STATIC int regmatch( char *prog );
-STATIC int regrepeat( char *p );
-
-#ifdef DEBUG
-int regnarrate = 0;
-void regdump();
-STATIC char *regprop();
-#endif
-
-/*
- - regexec - match a regexp against a string
- */
-int
-regexec(
- register regexp *prog,
- register const char *string )
-{
- register char *s;
-
- /* Be paranoid... */
- if (prog == NULL || string == NULL) {
- regerror("NULL parameter");
- return(0);
- }
-
- /* Check validity of program. */
- if (UCHARAT(prog->program) != MAGIC) {
- regerror("corrupted program");
- return(0);
- }
-
- /* If there is a "must appear" string, look for it. */
- if ( prog->regmust != NULL )
- {
- s = (char *)string;
- while ( ( s = strchr( s, prog->regmust[ 0 ] ) ) != NULL )
- {
- if ( !strncmp( s, prog->regmust, prog->regmlen ) )
- break; /* Found it. */
- ++s;
- }
- if ( s == NULL ) /* Not present. */
- return 0;
- }
-
- /* Mark beginning of line for ^ . */
- regbol = (char *)string;
-
- /* Simplest case: anchored match need be tried only once. */
- if ( prog->reganch )
- return regtry( prog, string );
-
- /* Messy cases: unanchored match. */
- s = (char *)string;
- if (prog->regstart != '\0')
- /* We know what char it must start with. */
- while ((s = strchr(s, prog->regstart)) != NULL) {
- if (regtry(prog, s))
- return(1);
- s++;
- }
- else
- /* We do not -- general case. */
- do {
- if ( regtry( prog, s ) )
- return( 1 );
- } while ( *s++ != '\0' );
-
- /* Failure. */
- return 0;
-}
-
-
-/*
- * regtry() - try match at specific point.
- */
-
-static int /* 0 failure, 1 success */
-regtry(
- regexp *prog,
- const char *string )
-{
- register int i;
- register const char * * sp;
- register const char * * ep;
-
- reginput = string;
- regstartp = prog->startp;
- regendp = prog->endp;
-
- sp = prog->startp;
- ep = prog->endp;
- for ( i = NSUBEXP; i > 0; --i )
- {
- *sp++ = NULL;
- *ep++ = NULL;
- }
- if ( regmatch( prog->program + 1 ) )
- {
- prog->startp[ 0 ] = string;
- prog->endp[ 0 ] = reginput;
- return 1;
- }
- else
- return 0;
-}
-
-
-/*
- * regmatch() - main matching routine.
- *
- * Conceptually the strategy is simple: check to see whether the current node
- * matches, call self recursively to see whether the rest matches, and then act
- * accordingly. In practice we make some effort to avoid recursion, in
- * particular by going through "ordinary" nodes (that do not need to know
- * whether the rest of the match failed) by a loop instead of by recursion.
- */
-
-static int /* 0 failure, 1 success */
-regmatch( char * prog )
-{
- char * scan; /* Current node. */
- char * next; /* Next node. */
-
- scan = prog;
-#ifdef DEBUG
- if (scan != NULL && regnarrate)
- fprintf(stderr, "%s(\n", regprop(scan));
-#endif
- while (scan != NULL) {
-#ifdef DEBUG
- if (regnarrate)
- fprintf(stderr, "%s...\n", regprop(scan));
-#endif
- next = regnext(scan);
-
- switch (OP(scan)) {
- case BOL:
- if (reginput != regbol)
- return(0);
- break;
- case EOL:
- if (*reginput != '\0')
- return(0);
- break;
- case WORDA:
- /* Must be looking at a letter, digit, or _ */
- if ((!isalnum(*reginput)) && *reginput != '_')
- return(0);
- /* Prev must be BOL or nonword */
- if (reginput > regbol &&
- (isalnum(reginput[-1]) || reginput[-1] == '_'))
- return(0);
- break;
- case WORDZ:
- /* Must be looking at non letter, digit, or _ */
- if (isalnum(*reginput) || *reginput == '_')
- return(0);
- /* We don't care what the previous char was */
- break;
- case ANY:
- if (*reginput == '\0')
- return(0);
- reginput++;
- break;
- case EXACTLY: {
- register int len;
- register char *opnd;
-
- opnd = OPERAND(scan);
- /* Inline the first character, for speed. */
- if (*opnd != *reginput)
- return(0);
- len = strlen(opnd);
- if (len > 1 && strncmp(opnd, reginput, len) != 0)
- return(0);
- reginput += len;
- }
- break;
- case ANYOF:
- if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL)
- return(0);
- reginput++;
- break;
- case ANYBUT:
- if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL)
- return(0);
- reginput++;
- break;
- case NOTHING:
- break;
- case BACK:
- break;
- case OPEN+1:
- case OPEN+2:
- case OPEN+3:
- case OPEN+4:
- case OPEN+5:
- case OPEN+6:
- case OPEN+7:
- case OPEN+8:
- case OPEN+9: {
- register int no;
- register const char *save;
-
- no = OP(scan) - OPEN;
- save = reginput;
-
- if (regmatch(next)) {
- /*
- * Don't set startp if some later
- * invocation of the same parentheses
- * already has.
- */
- if (regstartp[no] == NULL)
- regstartp[no] = save;
- return(1);
- } else
- return(0);
- }
- break;
- case CLOSE+1:
- case CLOSE+2:
- case CLOSE+3:
- case CLOSE+4:
- case CLOSE+5:
- case CLOSE+6:
- case CLOSE+7:
- case CLOSE+8:
- case CLOSE+9: {
- register int no;
- register const char *save;
-
- no = OP(scan) - CLOSE;
- save = reginput;
-
- if (regmatch(next)) {
- /*
- * Don't set endp if some later
- * invocation of the same parentheses
- * already has.
- */
- if (regendp[no] == NULL)
- regendp[no] = save;
- return(1);
- } else
- return(0);
- }
- break;
- case BRANCH: {
- register const char *save;
-
- if (OP(next) != BRANCH) /* No choice. */
- next = OPERAND(scan); /* Avoid recursion. */
- else {
- do {
- save = reginput;
- if (regmatch(OPERAND(scan)))
- return(1);
- reginput = save;
- scan = regnext(scan);
- } while (scan != NULL && OP(scan) == BRANCH);
- return(0);
- /* NOTREACHED */
- }
- }
- break;
- case STAR:
- case PLUS: {
- register char nextch;
- register int no;
- register const char *save;
- register int min;
-
- /*
- * Lookahead to avoid useless match attempts
- * when we know what character comes next.
- */
- nextch = '\0';
- if (OP(next) == EXACTLY)
- nextch = *OPERAND(next);
- min = (OP(scan) == STAR) ? 0 : 1;
- save = reginput;
- no = regrepeat(OPERAND(scan));
- while (no >= min) {
- /* If it could work, try it. */
- if (nextch == '\0' || *reginput == nextch)
- if (regmatch(next))
- return(1);
- /* Couldn't or didn't -- back up. */
- no--;
- reginput = save + no;
- }
- return(0);
- }
- break;
- case END:
- return(1); /* Success! */
- break;
- default:
- regerror("memory corruption");
- return(0);
- break;
- }
-
- scan = next;
- }
-
- /*
- * We get here only if there's trouble -- normally "case END" is
- * the terminating point.
- */
- regerror("corrupted pointers");
- return(0);
-}
-
-/*
- - regrepeat - repeatedly match something simple, report how many
- */
-static int
-regrepeat( char *p )
-{
- register int count = 0;
- register const char *scan;
- register char *opnd;
-
- scan = reginput;
- opnd = OPERAND(p);
- switch (OP(p)) {
- case ANY:
- count = strlen(scan);
- scan += count;
- break;
- case EXACTLY:
- while (*opnd == *scan) {
- count++;
- scan++;
- }
- break;
- case ANYOF:
- while (*scan != '\0' && strchr(opnd, *scan) != NULL) {
- count++;
- scan++;
- }
- break;
- case ANYBUT:
- while (*scan != '\0' && strchr(opnd, *scan) == NULL) {
- count++;
- scan++;
- }
- break;
- default: /* Oh dear. Called inappropriately. */
- regerror("internal foulup");
- count = 0; /* Best compromise. */
- break;
- }
- reginput = scan;
-
- return(count);
-}
-
-/*
- - regnext - dig the "next" pointer out of a node
- */
-static char *
-regnext( register char *p )
-{
- register int offset;
-
- if (p == &regdummy)
- return(NULL);
-
- offset = NEXT(p);
- if (offset == 0)
- return(NULL);
-
- if (OP(p) == BACK)
- return(p-offset);
- else
- return(p+offset);
-}
-
-#ifdef DEBUG
-
-STATIC char *regprop();
-
-/*
- - regdump - dump a regexp onto stdout in vaguely comprehensible form
- */
-void
-regdump( regexp *r )
-{
- register char *s;
- register char op = EXACTLY; /* Arbitrary non-END op. */
- register char *next;
-
-
- s = r->program + 1;
- while (op != END) { /* While that wasn't END last time... */
- op = OP(s);
- printf("%2d%s", s-r->program, regprop(s)); /* Where, what. */
- next = regnext(s);
- if (next == NULL) /* Next ptr. */
- printf("(0)");
- else
- printf("(%d)", (s-r->program)+(next-s));
- s += 3;
- if (op == ANYOF || op == ANYBUT || op == EXACTLY) {
- /* Literal string, where present. */
- while (*s != '\0') {
- putchar(*s);
- s++;
- }
- s++;
- }
- putchar('\n');
- }
-
- /* Header fields of interest. */
- if (r->regstart != '\0')
- printf("start `%c' ", r->regstart);
- if (r->reganch)
- printf("anchored ");
- if (r->regmust != NULL)
- printf("must have \"%s\"", r->regmust);
- printf("\n");
-}
-
-/*
- - regprop - printable representation of opcode
- */
-static char *
-regprop( char *op )
-{
- register char *p;
- static char buf[50];
-
- (void) strcpy(buf, ":");
-
- switch (OP(op)) {
- case BOL:
- p = "BOL";
- break;
- case EOL:
- p = "EOL";
- break;
- case ANY:
- p = "ANY";
- break;
- case ANYOF:
- p = "ANYOF";
- break;
- case ANYBUT:
- p = "ANYBUT";
- break;
- case BRANCH:
- p = "BRANCH";
- break;
- case EXACTLY:
- p = "EXACTLY";
- break;
- case NOTHING:
- p = "NOTHING";
- break;
- case BACK:
- p = "BACK";
- break;
- case END:
- p = "END";
- break;
- case OPEN+1:
- case OPEN+2:
- case OPEN+3:
- case OPEN+4:
- case OPEN+5:
- case OPEN+6:
- case OPEN+7:
- case OPEN+8:
- case OPEN+9:
- sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN);
- p = NULL;
- break;
- case CLOSE+1:
- case CLOSE+2:
- case CLOSE+3:
- case CLOSE+4:
- case CLOSE+5:
- case CLOSE+6:
- case CLOSE+7:
- case CLOSE+8:
- case CLOSE+9:
- sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE);
- p = NULL;
- break;
- case STAR:
- p = "STAR";
- break;
- case PLUS:
- p = "PLUS";
- break;
- case WORDA:
- p = "WORDA";
- break;
- case WORDZ:
- p = "WORDZ";
- break;
- default:
- regerror("corrupted opcode");
- break;
- }
- if (p != NULL)
- (void) strcat(buf, p);
- return(buf);
-}
-#endif
-
-/*
- * The following is provided for those people who do not have strcspn() in
- * their C libraries. They should get off their butts and do something
- * about it; at least one public-domain implementation of those (highly
- * useful) string routines has been published on Usenet.
- */
-#ifdef STRCSPN
-/*
- * strcspn - find length of initial segment of s1 consisting entirely
- * of characters not from s2
- */
-
-static int
-strcspn(
- char *s1,
- char *s2 )
-{
- register char *scan1;
- register char *scan2;
- register int count;
-
- count = 0;
- for (scan1 = s1; *scan1 != '\0'; scan1++) {
- for (scan2 = s2; *scan2 != '\0';) /* ++ moved down. */
- if (*scan1 == *scan2++)
- return(count);
- count++;
- }
- return(count);
-}
-#endif
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/jam-files/engine/regexp.h
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/jam-files/engine/regexp.h b/ext/kenlm/jam-files/engine/regexp.h
deleted file mode 100644
index 6898ccd..0000000
--- a/ext/kenlm/jam-files/engine/regexp.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * Definitions etc. for regexp(3) routines.
- *
- * Caveat: this is V8 regexp(3) [actually, a reimplementation thereof],
- * not the System V one.
- */
-#ifndef REGEXP_DWA20011023_H
-#define REGEXP_DWA20011023_H
-
-#define NSUBEXP 10
-typedef struct regexp {
- char const * startp[ NSUBEXP ];
- char const * endp[ NSUBEXP ];
- char regstart; /* Internal use only. */
- char reganch; /* Internal use only. */
- char * regmust; /* Internal use only. */
- int regmlen; /* Internal use only. */
- char program[ 1 ]; /* Unwarranted chumminess with compiler. */
-} regexp;
-
-
-regexp * regcomp( char const * exp );
-int regexec( regexp * prog, char const * string );
-void regerror( char const * s );
-
-
-/*
- * The first byte of the regexp internal "program" is actually this magic
- * number; the start node begins in the second byte.
- */
-#define MAGIC 0234
-
-#endif
-
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/jam-files/engine/rules.c
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/jam-files/engine/rules.c b/ext/kenlm/jam-files/engine/rules.c
deleted file mode 100644
index 7947c55..0000000
--- a/ext/kenlm/jam-files/engine/rules.c
+++ /dev/null
@@ -1,740 +0,0 @@
-/*
- * Copyright 1993, 1995 Christopher Seiwald.
- *
- * This file is part of Jam - see jam.c for Copyright information.
- */
-
-/* This file is ALSO:
- * Copyright 2001-2004 David Abrahams.
- * Distributed under the Boost Software License, Version 1.0.
- * (See accompanying file LICENSE_1_0.txt or http://www.boost.org/LICENSE_1_0.txt)
- */
-
-/*
- * rules.c - access to RULEs, TARGETs, and ACTIONs
- *
- * External routines:
- * bindrule() - return pointer to RULE, creating it if necessary.
- * bindtarget() - return pointer to TARGET, creating it if necessary.
- * touch_target() - mark a target to simulate being new.
- * targetlist() - turn list of target names into a TARGET chain.
- * targetentry() - add a TARGET to a chain of TARGETS.
- * actionlist() - append to an ACTION chain.
- * addsettings() - add a deferred "set" command to a target.
- * pushsettings() - set all target specific variables.
- * popsettings() - reset target specific variables to their pre-push values.
- * freesettings() - delete a settings list.
- * rules_done() - free RULE and TARGET tables.
- */
-
-#include "jam.h"
-#include "rules.h"
-
-#include "hash.h"
-#include "lists.h"
-#include "object.h"
-#include "parse.h"
-#include "pathsys.h"
-#include "search.h"
-#include "variable.h"
-
-
-static void set_rule_actions( RULE *, rule_actions * );
-static void set_rule_body ( RULE *, FUNCTION * );
-
-static struct hash * targethash = 0;
-
-
-/*
- * get_target_includes() - lazy creates a target's internal includes node
- *
- * The newly created node is not entered into the hash table as there should
- * never be a need to bind them directly from a target names. If you want to
- * access an internal includes node by name, first access the actual target and
- * then read the internal includes node from there.
- */
-
-static TARGET * get_target_includes( TARGET * const t )
-{
- if ( !t->includes )
- {
- TARGET * const i = (TARGET *)BJAM_MALLOC( sizeof( *t ) );
- memset( (char *)i, '\0', sizeof( *i ) );
- i->name = object_copy( t->name );
- i->boundname = object_copy( i->name );
- i->flags |= T_FLAG_NOTFILE | T_FLAG_INTERNAL;
- i->original_target = t;
- t->includes = i;
- }
- return t->includes;
-}
-
-
-/*
- * target_include() - adds a target to the given targe's 'included' list
- * target_include_many() - adds targets to the given target's 'included' list
- *
- * Included targets are modeled as dependencies of the including target's
- * internal include node.
- */
-
-void target_include( TARGET * const including, TARGET * const included )
-{
- TARGET * const internal = get_target_includes( including );
- internal->depends = targetentry( internal->depends, included );
-}
-
-void target_include_many( TARGET * const including, LIST * const included_names
- )
-{
- TARGET * const internal = get_target_includes( including );
- internal->depends = targetlist( internal->depends, included_names );
-}
-
-
-/*
- * enter_rule() - return pointer to RULE, creating it if necessary in
- * target_module.
- */
-
-static RULE * enter_rule( OBJECT * rulename, module_t * target_module )
-{
- int found;
- RULE * const r = (RULE *)hash_insert( demand_rules( target_module ),
- rulename, &found );
- if ( !found )
- {
- r->name = object_copy( rulename );
- r->procedure = 0;
- r->module = 0;
- r->actions = 0;
- r->exported = 0;
- r->module = target_module;
- }
- return r;
-}
-
-
-/*
- * define_rule() - return pointer to RULE, creating it if necessary in
- * target_module. Prepare it to accept a body or action originating in
- * src_module.
- */
-
-static RULE * define_rule( module_t * src_module, OBJECT * rulename,
- module_t * target_module )
-{
- RULE * const r = enter_rule( rulename, target_module );
- if ( r->module != src_module )
- {
- /* If the rule was imported from elsewhere, clear it now. */
- set_rule_body( r, 0 );
- set_rule_actions( r, 0 );
- /* r will be executed in the source module. */
- r->module = src_module;
- }
- return r;
-}
-
-
-void rule_free( RULE * r )
-{
- object_free( r->name );
- r->name = 0;
- if ( r->procedure )
- function_free( r->procedure );
- r->procedure = 0;
- if ( r->actions )
- actions_free( r->actions );
- r->actions = 0;
-}
-
-
-/*
- * bindtarget() - return pointer to TARGET, creating it if necessary.
- */
-
-TARGET * bindtarget( OBJECT * const target_name )
-{
- int found;
- TARGET * t;
-
- if ( !targethash )
- targethash = hashinit( sizeof( TARGET ), "targets" );
-
- t = (TARGET *)hash_insert( targethash, target_name, &found );
- if ( !found )
- {
- memset( (char *)t, '\0', sizeof( *t ) );
- t->name = object_copy( target_name );
- t->boundname = object_copy( t->name ); /* default for T_FLAG_NOTFILE */
- }
-
- return t;
-}
-
-
-static void bind_explicitly_located_target( void * xtarget, void * data )
-{
- TARGET * t = (TARGET *)xtarget;
- if ( !( t->flags & T_FLAG_NOTFILE ) )
- {
- /* Check if there is a setting for LOCATE. */
- SETTINGS * s = t->settings;
- for ( ; s ; s = s->next )
- {
- if ( object_equal( s->symbol, constant_LOCATE ) && ! list_empty( s->value ) )
- {
- set_explicit_binding( t->name, list_front( s->value ) );
- break;
- }
- }
- }
-}
-
-
-void bind_explicitly_located_targets()
-{
- if ( targethash )
- hashenumerate( targethash, bind_explicitly_located_target, (void *)0 );
-}
-
-
-/*
- * touch_target() - mark a target to simulate being new.
- */
-
-void touch_target( OBJECT * const t )
-{
- bindtarget( t )->flags |= T_FLAG_TOUCHED;
-}
-
-
-/*
- * target_scc() - returns the root of a strongly connected component that this
- * target is a part of.
- */
-
-TARGET * target_scc( TARGET * t )
-{
- TARGET * result = t;
- while ( result->scc_root )
- result = result->scc_root;
- while ( t->scc_root )
- {
- TARGET * const tmp = t->scc_root;
- t->scc_root = result;
- t = tmp;
- }
- return result;
-}
-
-
-/*
- * targetlist() - turn list of target names into a TARGET chain.
- *
- * Inputs:
- * chain existing TARGETS to append to
- * targets list of target names
- */
-
-TARGETS * targetlist( TARGETS * chain, LIST * target_names )
-{
- LISTITER iter = list_begin( target_names );
- LISTITER const end = list_end( target_names );
- for ( ; iter != end; iter = list_next( iter ) )
- chain = targetentry( chain, bindtarget( list_item( iter ) ) );
- return chain;
-}
-
-
-/*
- * targetentry() - add a TARGET to a chain of TARGETS.
- *
- * Inputs:
- * chain existing TARGETS to append to
- * target new target to append
- */
-
-TARGETS * targetentry( TARGETS * chain, TARGET * target )
-{
- TARGETS * const c = (TARGETS *)BJAM_MALLOC( sizeof( TARGETS ) );
- c->target = target;
-
- if ( !chain ) chain = c;
- else chain->tail->next = c;
- chain->tail = c;
- c->next = 0;
-
- return chain;
-}
-
-
-/*
- * targetchain() - append two TARGET chains.
- *
- * Inputs:
- * chain existing TARGETS to append to
- * target new target to append
- */
-
-TARGETS * targetchain( TARGETS * chain, TARGETS * targets )
-{
- if ( !targets ) return chain;
- if ( !chain ) return targets;
-
- chain->tail->next = targets;
- chain->tail = targets->tail;
- return chain;
-}
-
-/*
- * action_free - decrement the ACTIONs refrence count and (maybe) free it.
- */
-
-void action_free( ACTION * action )
-{
- if ( --action->refs == 0 )
- {
- freetargets( action->targets );
- freetargets( action->sources );
- BJAM_FREE( action );
- }
-}
-
-
-/*
- * actionlist() - append to an ACTION chain.
- */
-
-ACTIONS * actionlist( ACTIONS * chain, ACTION * action )
-{
- ACTIONS * const actions = (ACTIONS *)BJAM_MALLOC( sizeof( ACTIONS ) );
- actions->action = action;
- ++action->refs;
- if ( !chain ) chain = actions;
- else chain->tail->next = actions;
- chain->tail = actions;
- actions->next = 0;
- return chain;
-}
-
-static SETTINGS * settings_freelist;
-
-
-/*
- * addsettings() - add a deferred "set" command to a target.
- *
- * Adds a variable setting (varname=list) onto a chain of settings for a
- * particular target. 'flag' controls the relationship between new and old
- * values in the same way as in var_set() function (see variable.c). Returns the
- * head of the settings chain.
- */
-
-SETTINGS * addsettings( SETTINGS * head, int flag, OBJECT * symbol,
- LIST * value )
-{
- SETTINGS * v;
-
- /* Look for previous settings. */
- for ( v = head; v; v = v->next )
- if ( object_equal( v->symbol, symbol ) )
- break;
-
- /* If not previously set, alloc a new. */
- /* If appending, do so. */
- /* Else free old and set new. */
- if ( !v )
- {
- v = settings_freelist;
- if ( v )
- settings_freelist = v->next;
- else
- v = (SETTINGS *)BJAM_MALLOC( sizeof( *v ) );
-
- v->symbol = object_copy( symbol );
- v->value = value;
- v->next = head;
- head = v;
- }
- else if ( flag == VAR_APPEND )
- {
- v->value = list_append( v->value, value );
- }
- else if ( flag != VAR_DEFAULT )
- {
- list_free( v->value );
- v->value = value;
- }
- else
- list_free( value );
-
- /* Return (new) head of list. */
- return head;
-}
-
-
-/*
- * pushsettings() - set all target specific variables.
- */
-
-void pushsettings( struct module_t * module, SETTINGS * v )
-{
- for ( ; v; v = v->next )
- v->value = var_swap( module, v->symbol, v->value );
-}
-
-
-/*
- * popsettings() - reset target specific variables to their pre-push values.
- */
-
-void popsettings( struct module_t * module, SETTINGS * v )
-{
- pushsettings( module, v ); /* just swap again */
-}
-
-
-/*
- * copysettings() - duplicate a settings list, returning the new copy.
- */
-
-SETTINGS * copysettings( SETTINGS * head )
-{
- SETTINGS * copy = 0;
- SETTINGS * v;
- for ( v = head; v; v = v->next )
- copy = addsettings( copy, VAR_SET, v->symbol, list_copy( v->value ) );
- return copy;
-}
-
-
-/*
- * freetargets() - delete a targets list.
- */
-
-void freetargets( TARGETS * chain )
-{
- while ( chain )
- {
- TARGETS * const n = chain->next;
- BJAM_FREE( chain );
- chain = n;
- }
-}
-
-
-/*
- * freeactions() - delete an action list.
- */
-
-void freeactions( ACTIONS * chain )
-{
- while ( chain )
- {
- ACTIONS * const n = chain->next;
- action_free( chain->action );
- BJAM_FREE( chain );
- chain = n;
- }
-}
-
-
-/*
- * freesettings() - delete a settings list.
- */
-
-void freesettings( SETTINGS * v )
-{
- while ( v )
- {
- SETTINGS * const n = v->next;
- object_free( v->symbol );
- list_free( v->value );
- v->next = settings_freelist;
- settings_freelist = v;
- v = n;
- }
-}
-
-
-static void freetarget( void * xt, void * data )
-{
- TARGET * const t = (TARGET *)xt;
- if ( t->name ) object_free ( t->name );
- if ( t->boundname ) object_free ( t->boundname );
- if ( t->settings ) freesettings( t->settings );
- if ( t->depends ) freetargets ( t->depends );
- if ( t->dependants ) freetargets ( t->dependants );
- if ( t->parents ) freetargets ( t->parents );
- if ( t->actions ) freeactions ( t->actions );
- if ( t->includes )
- {
- freetarget( t->includes, (void *)0 );
- BJAM_FREE( t->includes );
- }
-}
-
-
-/*
- * rules_done() - free RULE and TARGET tables.
- */
-
-void rules_done()
-{
- if ( targethash )
- {
- hashenumerate( targethash, freetarget, 0 );
- hashdone( targethash );
- }
- while ( settings_freelist )
- {
- SETTINGS * const n = settings_freelist->next;
- BJAM_FREE( settings_freelist );
- settings_freelist = n;
- }
-}
-
-
-/*
- * actions_refer() - add a new reference to the given actions.
- */
-
-void actions_refer( rule_actions * a )
-{
- ++a->reference_count;
-}
-
-
-/*
- * actions_free() - release a reference to given actions.
- */
-
-void actions_free( rule_actions * a )
-{
- if ( --a->reference_count <= 0 )
- {
- function_free( a->command );
- list_free( a->bindlist );
- BJAM_FREE( a );
- }
-}
-
-
-/*
- * set_rule_body() - set the argument list and procedure of the given rule.
- */
-
-static void set_rule_body( RULE * rule, FUNCTION * procedure )
-{
- if ( procedure )
- function_refer( procedure );
- if ( rule->procedure )
- function_free( rule->procedure );
- rule->procedure = procedure;
-}
-
-
-/*
- * global_name() - given a rule, return the name for a corresponding rule in the
- * global module.
- */
-
-static OBJECT * global_rule_name( RULE * r )
-{
- if ( r->module == root_module() )
- return object_copy( r->name );
-
- {
- char name[ 4096 ] = "";
- if ( r->module->name )
- {
- strncat( name, object_str( r->module->name ), sizeof( name ) - 1 );
- strncat( name, ".", sizeof( name ) - 1 );
- }
- strncat( name, object_str( r->name ), sizeof( name ) - 1 );
- return object_new( name );
- }
-}
-
-
-/*
- * global_rule() - given a rule, produce a corresponding entry in the global
- * module.
- */
-
-static RULE * global_rule( RULE * r )
-{
- if ( r->module == root_module() )
- return r;
-
- {
- OBJECT * const name = global_rule_name( r );
- RULE * const result = define_rule( r->module, name, root_module() );
- object_free( name );
- return result;
- }
-}
-
-
-/*
- * new_rule_body() - make a new rule named rulename in the given module, with
- * the given argument list and procedure. If exported is true, the rule is
- * exported to the global module as modulename.rulename.
- */
-
-RULE * new_rule_body( module_t * m, OBJECT * rulename, FUNCTION * procedure,
- int exported )
-{
- RULE * const local = define_rule( m, rulename, m );
- local->exported = exported;
- set_rule_body( local, procedure );
-
- /* Mark the procedure with the global rule name, regardless of whether the
- * rule is exported. That gives us something reasonably identifiable that we
- * can use, e.g. in profiling output. Only do this once, since this could be
- * called multiple times with the same procedure.
- */
- if ( !function_rulename( procedure ) )
- function_set_rulename( procedure, global_rule_name( local ) );
-
- return local;
-}
-
-
-static void set_rule_actions( RULE * rule, rule_actions * actions )
-{
- if ( actions )
- actions_refer( actions );
- if ( rule->actions )
- actions_free( rule->actions );
- rule->actions = actions;
-}
-
-
-static rule_actions * actions_new( FUNCTION * command, LIST * bindlist,
- int flags )
-{
- rule_actions * const result = (rule_actions *)BJAM_MALLOC( sizeof(
- rule_actions ) );
- function_refer( command );
- result->command = command;
- result->bindlist = bindlist;
- result->flags = flags;
- result->reference_count = 0;
- return result;
-}
-
-
-RULE * new_rule_actions( module_t * m, OBJECT * rulename, FUNCTION * command,
- LIST * bindlist, int flags )
-{
- RULE * const local = define_rule( m, rulename, m );
- RULE * const global = global_rule( local );
- set_rule_actions( local, actions_new( command, bindlist, flags ) );
- set_rule_actions( global, local->actions );
- return local;
-}
-
-
-/*
- * Looks for a rule in the specified module, and returns it, if found. First
- * checks if the rule is present in the module's rule table. Second, if the
- * rule's name is in the form name1.name2 and name1 is in the list of imported
- * modules, look in module 'name1' for rule 'name2'.
- */
-
-RULE * lookup_rule( OBJECT * rulename, module_t * m, int local_only )
-{
- RULE * r;
- RULE * result = 0;
- module_t * original_module = m;
-
- if ( m->class_module )
- m = m->class_module;
-
- if ( m->rules && ( r = (RULE *)hash_find( m->rules, rulename ) ) )
- result = r;
- else if ( !local_only && m->imported_modules )
- {
- /* Try splitting the name into module and rule. */
- char * p = strchr( object_str( rulename ), '.' ) ;
- if ( p )
- {
- /* Now, r->name keeps the module name, and p + 1 keeps the rule
- * name.
- */
- OBJECT * rule_part = object_new( p + 1 );
- OBJECT * module_part;
- {
- string buf[ 1 ];
- string_new( buf );
- string_append_range( buf, object_str( rulename ), p );
- module_part = object_new( buf->value );
- string_free( buf );
- }
- if ( hash_find( m->imported_modules, module_part ) )
- result = lookup_rule( rule_part, bindmodule( module_part ), 1 );
- object_free( module_part );
- object_free( rule_part );
- }
- }
-
- if ( result )
- {
- if ( local_only && !result->exported )
- result = 0;
- else if ( original_module != m )
- {
- /* Lookup started in class module. We have found a rule in class
- * module, which is marked for execution in that module, or in some
- * instance. Mark it for execution in the instance where we started
- * the lookup.
- */
- int const execute_in_class = result->module == m;
- int const execute_in_some_instance =
- result->module->class_module == m;
- if ( execute_in_class || execute_in_some_instance )
- result->module = original_module;
- }
- }
-
- return result;
-}
-
-
-RULE * bindrule( OBJECT * rulename, module_t * m )
-{
- RULE * result = lookup_rule( rulename, m, 0 );
- if ( !result )
- result = lookup_rule( rulename, root_module(), 0 );
- /* We have only one caller, 'evaluate_rule', which will complain about
- * calling an undefined rule. We could issue the error here, but we do not
- * have the necessary information, such as frame.
- */
- if ( !result )
- result = enter_rule( rulename, m );
- return result;
-}
-
-
-RULE * import_rule( RULE * source, module_t * m, OBJECT * name )
-{
- RULE * const dest = define_rule( source->module, name, m );
- set_rule_body( dest, source->procedure );
- set_rule_actions( dest, source->actions );
- return dest;
-}
-
-
-void rule_localize( RULE * rule, module_t * m )
-{
- rule->module = m;
- if ( rule->procedure )
- {
- FUNCTION * procedure = function_unbind_variables( rule->procedure );
- function_refer( procedure );
- function_free( rule->procedure );
- rule->procedure = procedure;
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/jam-files/engine/rules.h
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/jam-files/engine/rules.h b/ext/kenlm/jam-files/engine/rules.h
deleted file mode 100644
index fe2792f..0000000
--- a/ext/kenlm/jam-files/engine/rules.h
+++ /dev/null
@@ -1,270 +0,0 @@
-/*
- * Copyright 1993, 1995 Christopher Seiwald.
- *
- * This file is part of Jam - see jam.c for Copyright information.
- */
-
-/* This file is ALSO:
- * Copyright 2001-2004 David Abrahams.
- * Distributed under the Boost Software License, Version 1.0.
- * (See accompanying file LICENSE_1_0.txt or http://www.boost.org/LICENSE_1_0.txt)
- */
-
-/*
- * rules.h - targets, rules, and related information
- *
- * This file describes the structures holding the targets, rules, and related
- * information accumulated by interpreting the statements of the jam files.
- *
- * The following are defined:
- *
- * RULE - a generic jam rule, the product of RULE and ACTIONS.
- * ACTIONS - a chain of ACTIONs.
- * ACTION - a RULE instance with targets and sources.
- * SETTINGS - variables to set when executing a TARGET's ACTIONS.
- * TARGETS - a chain of TARGETs.
- * TARGET - an entity (e.g. a file) that can be built.
- */
-
-#ifndef RULES_DWA_20011020_H
-#define RULES_DWA_20011020_H
-
-#include "function.h"
-#include "modules.h"
-#include "timestamp.h"
-
-
-typedef struct _rule RULE;
-typedef struct _target TARGET;
-typedef struct _targets TARGETS;
-typedef struct _action ACTION;
-typedef struct _actions ACTIONS;
-typedef struct _settings SETTINGS ;
-
-/* RULE - a generic jam rule, the product of RULE and ACTIONS. */
-
-/* Build actions corresponding to a rule. */
-struct rule_actions
-{
- int reference_count;
- FUNCTION * command; /* command string from ACTIONS */
- LIST * bindlist;
- int flags; /* modifiers on ACTIONS */
-
-#define RULE_NEWSRCS 0x01 /* $(>) is updated sources only */
-#define RULE_TOGETHER 0x02 /* combine actions on single target */
-#define RULE_IGNORE 0x04 /* ignore return status of executes */
-#define RULE_QUIETLY 0x08 /* do not mention it unless verbose */
-#define RULE_PIECEMEAL 0x10 /* split exec so each $(>) is small */
-#define RULE_EXISTING 0x20 /* $(>) is pre-exisitng sources only */
-};
-
-typedef struct rule_actions rule_actions;
-typedef struct argument_list argument_list;
-
-struct _rule
-{
- OBJECT * name;
- FUNCTION * procedure;
- rule_actions * actions; /* build actions, or NULL for no actions */
- module_t * module; /* module in which this rule is executed */
- int exported; /* nonzero if this rule is supposed to appear in
- * the global module and be automatically
- * imported into other modules
- */
-};
-
-/* ACTIONS - a chain of ACTIONs. */
-struct _actions
-{
- ACTIONS * next;
- ACTIONS * tail; /* valid only for head */
- ACTION * action;
-};
-
-/* ACTION - a RULE instance with targets and sources. */
-struct _action
-{
- RULE * rule;
- TARGETS * targets;
- TARGETS * sources; /* aka $(>) */
- char running; /* has been started */
-#define A_INIT 0
-#define A_RUNNING_NOEXEC 1
-#define A_RUNNING 2
- char status; /* see TARGET status */
- int refs;
-};
-
-/* SETTINGS - variables to set when executing a TARGET's ACTIONS. */
-struct _settings
-{
- SETTINGS * next;
- OBJECT * symbol; /* symbol name for var_set() */
- LIST * value; /* symbol value for var_set() */
-};
-
-/* TARGETS - a chain of TARGETs. */
-struct _targets
-{
- TARGETS * next;
- TARGETS * tail; /* valid only for head */
- TARGET * target;
-};
-
-/* TARGET - an entity (e.g. a file) that can be built. */
-struct _target
-{
- OBJECT * name;
- OBJECT * boundname; /* if search() relocates target */
- ACTIONS * actions; /* rules to execute, if any */
- SETTINGS * settings; /* variables to define */
-
- short flags; /* status info */
-
-#define T_FLAG_TEMP 0x0001 /* TEMPORARY applied */
-#define T_FLAG_NOCARE 0x0002 /* NOCARE applied */
-#define T_FLAG_NOTFILE 0x0004 /* NOTFILE applied */
-#define T_FLAG_TOUCHED 0x0008 /* ALWAYS applied or -t target */
-#define T_FLAG_LEAVES 0x0010 /* LEAVES applied */
-#define T_FLAG_NOUPDATE 0x0020 /* NOUPDATE applied */
-#define T_FLAG_VISITED 0x0040 /* CWM: Used in debugging */
-
-/* This flag has been added to support a new built-in rule named "RMBAD". It is
- * used to force removal of outdated targets whose dependencies fail to build.
- */
-#define T_FLAG_RMOLD 0x0080 /* RMBAD applied */
-
-/* This flag was added to support a new built-in rule named "FAIL_EXPECTED" used
- * to indicate that the result of running a given action should be inverted,
- * i.e. ok <=> fail. Useful for launching certain test runs from a Jamfile.
- */
-#define T_FLAG_FAIL_EXPECTED 0x0100 /* FAIL_EXPECTED applied */
-
-#define T_FLAG_INTERNAL 0x0200 /* internal INCLUDES node */
-
-/* Indicates that the target must be a file. Prevents matching non-files, like
- * directories, when a target is searched.
- */
-#define T_FLAG_ISFILE 0x0400
-
-#define T_FLAG_PRECIOUS 0x0800
-
- char binding; /* how target relates to a real file or
- * folder
- */
-
-#define T_BIND_UNBOUND 0 /* a disembodied name */
-#define T_BIND_MISSING 1 /* could not find real file */
-#define T_BIND_PARENTS 2 /* using parent's timestamp */
-#define T_BIND_EXISTS 3 /* real file, timestamp valid */
-
- TARGETS * depends; /* dependencies */
- TARGETS * dependants; /* the inverse of dependencies */
- TARGETS * rebuilds; /* targets that should be force-rebuilt
- * whenever this one is
- */
- TARGET * includes; /* internal includes node */
- TARGET * original_target; /* original_target->includes = this */
- char rescanned;
-
- timestamp time; /* update time */
- timestamp leaf; /* update time of leaf sources */
-
- char fate; /* make0()'s diagnosis */
-
-#define T_FATE_INIT 0 /* nothing done to target */
-#define T_FATE_MAKING 1 /* make0(target) on stack */
-
-#define T_FATE_STABLE 2 /* target did not need updating */
-#define T_FATE_NEWER 3 /* target newer than parent */
-
-#define T_FATE_SPOIL 4 /* >= SPOIL rebuilds parents */
-#define T_FATE_ISTMP 4 /* unneeded temp target oddly present */
-
-#define T_FATE_BUILD 5 /* >= BUILD rebuilds target */
-#define T_FATE_TOUCHED 5 /* manually touched with -t */
-#define T_FATE_REBUILD 6
-#define T_FATE_MISSING 7 /* is missing, needs updating */
-#define T_FATE_NEEDTMP 8 /* missing temp that must be rebuild */
-#define T_FATE_OUTDATED 9 /* is out of date, needs updating */
-#define T_FATE_UPDATE 10 /* deps updated, needs updating */
-
-#define T_FATE_BROKEN 11 /* >= BROKEN ruins parents */
-#define T_FATE_CANTFIND 11 /* no rules to make missing target */
-#define T_FATE_CANTMAKE 12 /* can not find dependencies */
-
- char progress; /* tracks make1() progress */
-
-#define T_MAKE_INIT 0 /* make1(target) not yet called */
-#define T_MAKE_ONSTACK 1 /* make1(target) on stack */
-#define T_MAKE_ACTIVE 2 /* make1(target) in make1b() */
-#define T_MAKE_RUNNING 3 /* make1(target) running commands */
-#define T_MAKE_DONE 4 /* make1(target) done */
-#define T_MAKE_NOEXEC_DONE 5 /* make1(target) done with -n in effect */
-
-#ifdef OPT_SEMAPHORE
- #define T_MAKE_SEMAPHORE 5 /* Special target type for semaphores */
-#endif
-
-#ifdef OPT_SEMAPHORE
- TARGET * semaphore; /* used in serialization */
-#endif
-
- char status; /* exec_cmd() result */
-
- int asynccnt; /* child deps outstanding */
- TARGETS * parents; /* used by make1() for completion */
- TARGET * scc_root; /* used by make to resolve cyclic includes
- */
- TARGET * rescanning; /* used by make0 to mark visited targets
- * when rescanning
- */
- int depth; /* The depth of the target in the make0
- * stack.
- */
- char * cmds; /* type-punned command list */
-
- char const * failed;
-};
-
-
-/* Action related functions. */
-void action_free ( ACTION * );
-ACTIONS * actionlist ( ACTIONS *, ACTION * );
-void freeactions ( ACTIONS * );
-SETTINGS * addsettings ( SETTINGS *, int flag, OBJECT * symbol, LIST * value );
-void pushsettings ( module_t *, SETTINGS * );
-void popsettings ( module_t *, SETTINGS * );
-SETTINGS * copysettings ( SETTINGS * );
-void freesettings ( SETTINGS * );
-void actions_refer( rule_actions * );
-void actions_free ( rule_actions * );
-
-/* Rule related functions. */
-RULE * bindrule ( OBJECT * rulename, module_t * );
-RULE * import_rule ( RULE * source, module_t *, OBJECT * name );
-void rule_localize ( RULE * rule, module_t * module );
-RULE * new_rule_body ( module_t *, OBJECT * rulename, FUNCTION * func, int exprt );
-RULE * new_rule_actions( module_t *, OBJECT * rulename, FUNCTION * command, LIST * bindlist, int flags );
-void rule_free ( RULE * );
-
-/* Target related functions. */
-void bind_explicitly_located_targets();
-TARGET * bindtarget ( OBJECT * const );
-void freetargets ( TARGETS * );
-TARGETS * targetchain ( TARGETS *, TARGETS * );
-TARGETS * targetentry ( TARGETS *, TARGET * );
-void target_include ( TARGET * const including,
- TARGET * const included );
-void target_include_many ( TARGET * const including,
- LIST * const included_names );
-TARGETS * targetlist ( TARGETS *, LIST * target_names );
-void touch_target ( OBJECT * const );
-void clear_includes ( TARGET * );
-TARGET * target_scc ( TARGET * );
-
-/* Final module cleanup. */
-void rules_done();
-
-#endif
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/jam-files/engine/scan.c
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/jam-files/engine/scan.c b/ext/kenlm/jam-files/engine/scan.c
deleted file mode 100644
index d92fdca..0000000
--- a/ext/kenlm/jam-files/engine/scan.c
+++ /dev/null
@@ -1,404 +0,0 @@
-/*
- * Copyright 1993-2002 Christopher Seiwald and Perforce Software, Inc.
- *
- * This file is part of Jam - see jam.c for Copyright information.
- */
-
-/*
- * scan.c - the jam yacc scanner
- *
- */
-
-#include "jam.h"
-#include "scan.h"
-
-#include "constants.h"
-#include "jambase.h"
-#include "jamgram.h"
-
-
-struct keyword
-{
- char * word;
- int type;
-} keywords[] =
-{
-#include "jamgramtab.h"
- { 0, 0 }
-};
-
-typedef struct include include;
-struct include
-{
- include * next; /* next serial include file */
- char * string; /* pointer into current line */
- char * * strings; /* for yyfparse() -- text to parse */
- FILE * file; /* for yyfparse() -- file being read */
- OBJECT * fname; /* for yyfparse() -- file name */
- int line; /* line counter for error messages */
- char buf[ 512 ]; /* for yyfparse() -- line buffer */
-};
-
-static include * incp = 0; /* current file; head of chain */
-
-static int scanmode = SCAN_NORMAL;
-static int anyerrors = 0;
-
-
-static char * symdump( YYSTYPE * );
-
-#define BIGGEST_TOKEN 10240 /* no single token can be larger */
-
-
-/*
- * Set parser mode: normal, string, or keyword.
- */
-
-void yymode( int n )
-{
- scanmode = n;
-}
-
-
-void yyerror( char const * s )
-{
- /* We use yylval instead of incp to access the error location information as
- * the incp pointer will already be reset to 0 in case the error occurred at
- * EOF.
- *
- * The two may differ only if ran into an unexpected EOF or we get an error
- * while reading a lexical token spanning multiple lines, e.g. a multi-line
- * string literal or action body, in which case yylval location information
- * will hold the information about where the token started while incp will
- * hold the information about where reading it broke.
- */
- printf( "%s:%d: %s at %s\n", object_str( yylval.file ), yylval.line, s,
- symdump( &yylval ) );
- ++anyerrors;
-}
-
-
-int yyanyerrors()
-{
- return anyerrors != 0;
-}
-
-
-void yyfparse( OBJECT * s )
-{
- include * i = (include *)BJAM_MALLOC( sizeof( *i ) );
-
- /* Push this onto the incp chain. */
- i->string = "";
- i->strings = 0;
- i->file = 0;
- i->fname = object_copy( s );
- i->line = 0;
- i->next = incp;
- incp = i;
-
- /* If the filename is "+", it means use the internal jambase. */
- if ( !strcmp( object_str( s ), "+" ) )
- i->strings = jambase;
-}
-
-
-/*
- * yyline() - read new line and return first character.
- *
- * Fabricates a continuous stream of characters across include files, returning
- * EOF at the bitter end.
- */
-
-int yyline()
-{
- include * const i = incp;
-
- if ( !incp )
- return EOF;
-
- /* Once we start reading from the input stream, we reset the include
- * insertion point so that the next include file becomes the head of the
- * list.
- */
-
- /* If there is more data in this line, return it. */
- if ( *i->string )
- return *i->string++;
-
- /* If we are reading from an internal string list, go to the next string. */
- if ( i->strings )
- {
- if ( *i->strings )
- {
- ++i->line;
- i->string = *(i->strings++);
- return *i->string++;
- }
- }
- else
- {
- /* If necessary, open the file. */
- if ( !i->file )
- {
- FILE * f = stdin;
- if ( strcmp( object_str( i->fname ), "-" ) && !( f = fopen( object_str( i->fname ), "r" ) ) )
- perror( object_str( i->fname ) );
- i->file = f;
- }
-
- /* If there is another line in this file, start it. */
- if ( i->file && fgets( i->buf, sizeof( i->buf ), i->file ) )
- {
- ++i->line;
- i->string = i->buf;
- return *i->string++;
- }
- }
-
- /* This include is done. Free it up and return EOF so yyparse() returns to
- * parse_file().
- */
-
- incp = i->next;
-
- /* Close file, free name. */
- if ( i->file && ( i->file != stdin ) )
- fclose( i->file );
- object_free( i->fname );
- BJAM_FREE( (char *)i );
-
- return EOF;
-}
-
-
-/*
- * yylex() - set yylval to current token; return its type.
- *
- * Macros to move things along:
- *
- * yychar() - return and advance character; invalid after EOF.
- * yyprev() - back up one character; invalid before yychar().
- *
- * yychar() returns a continuous stream of characters, until it hits the EOF of
- * the current include file.
- */
-
-#define yychar() ( *incp->string ? *incp->string++ : yyline() )
-#define yyprev() ( incp->string-- )
-
-int yylex()
-{
- int c;
- char buf[ BIGGEST_TOKEN ];
- char * b = buf;
-
- if ( !incp )
- goto eof;
-
- /* Get first character (whitespace or of token). */
- c = yychar();
-
- if ( scanmode == SCAN_STRING )
- {
- /* If scanning for a string (action's {}'s), look for the closing brace.
- * We handle matching braces, if they match.
- */
-
- int nest = 1;
-
- while ( ( c != EOF ) && ( b < buf + sizeof( buf ) ) )
- {
- if ( c == '{' )
- ++nest;
-
- if ( ( c == '}' ) && !--nest )
- break;
-
- *b++ = c;
-
- c = yychar();
-
- /* Turn trailing "\r\n" sequences into plain "\n" for Cygwin. */
- if ( ( c == '\n' ) && ( b[ -1 ] == '\r' ) )
- --b;
- }
-
- /* We ate the ending brace -- regurgitate it. */
- if ( c != EOF )
- yyprev();
-
- /* Check for obvious errors. */
- if ( b == buf + sizeof( buf ) )
- {
- yyerror( "action block too big" );
- goto eof;
- }
-
- if ( nest )
- {
- yyerror( "unmatched {} in action block" );
- goto eof;
- }
-
- *b = 0;
- yylval.type = STRING;
- yylval.string = object_new( buf );
- yylval.file = incp->fname;
- yylval.line = incp->line;
- }
- else
- {
- char * b = buf;
- struct keyword * k;
- int inquote = 0;
- int notkeyword;
-
- /* Eat white space. */
- for ( ; ; )
- {
- /* Skip past white space. */
- while ( ( c != EOF ) && isspace( c ) )
- c = yychar();
-
- /* Not a comment? */
- if ( c != '#' )
- break;
-
- /* Swallow up comment line. */
- while ( ( ( c = yychar() ) != EOF ) && ( c != '\n' ) ) ;
- }
-
- /* c now points to the first character of a token. */
- if ( c == EOF )
- goto eof;
-
- yylval.file = incp->fname;
- yylval.line = incp->line;
-
- /* While scanning the word, disqualify it for (expensive) keyword lookup
- * when we can: $anything, "anything", \anything
- */
- notkeyword = c == '$';
-
- /* Look for white space to delimit word. "'s get stripped but preserve
- * white space. \ protects next character.
- */
- while
- (
- ( c != EOF ) &&
- ( b < buf + sizeof( buf ) ) &&
- ( inquote || !isspace( c ) )
- )
- {
- if ( c == '"' )
- {
- /* begin or end " */
- inquote = !inquote;
- notkeyword = 1;
- }
- else if ( c != '\\' )
- {
- /* normal char */
- *b++ = c;
- }
- else if ( ( c = yychar() ) != EOF )
- {
- /* \c */
- if (c == 'n')
- c = '\n';
- else if (c == 'r')
- c = '\r';
- else if (c == 't')
- c = '\t';
- *b++ = c;
- notkeyword = 1;
- }
- else
- {
- /* \EOF */
- break;
- }
-
- c = yychar();
- }
-
- /* Check obvious errors. */
- if ( b == buf + sizeof( buf ) )
- {
- yyerror( "string too big" );
- goto eof;
- }
-
- if ( inquote )
- {
- yyerror( "unmatched \" in string" );
- goto eof;
- }
-
- /* We looked ahead a character - back up. */
- if ( c != EOF )
- yyprev();
-
- /* Scan token table. Do not scan if it is obviously not a keyword or if
- * it is an alphabetic when were looking for punctuation.
- */
-
- *b = 0;
- yylval.type = ARG;
-
- if ( !notkeyword && !( isalpha( *buf ) && ( scanmode == SCAN_PUNCT ) ) )
- for ( k = keywords; k->word; ++k )
- if ( ( *buf == *k->word ) && !strcmp( k->word, buf ) )
- {
- yylval.type = k->type;
- yylval.keyword = k->word; /* used by symdump */
- break;
- }
-
- if ( yylval.type == ARG )
- yylval.string = object_new( buf );
- }
-
- if ( DEBUG_SCAN )
- printf( "scan %s\n", symdump( &yylval ) );
-
- return yylval.type;
-
-eof:
- /* We do not reset yylval.file & yylval.line here so unexpected EOF error
- * messages would include correct error location information.
- */
- yylval.type = EOF;
- return yylval.type;
-}
-
-
-static char * symdump( YYSTYPE * s )
-{
- static char buf[ BIGGEST_TOKEN + 20 ];
- switch ( s->type )
- {
- case EOF : sprintf( buf, "EOF" ); break;
- case 0 : sprintf( buf, "unknown symbol %s", object_str( s->string ) ); break;
- case ARG : sprintf( buf, "argument %s" , object_str( s->string ) ); break;
- case STRING: sprintf( buf, "string \"%s\"" , object_str( s->string ) ); break;
- default : sprintf( buf, "keyword %s" , s->keyword ); break;
- }
- return buf;
-}
-
-
-/*
- * Get information about the current file and line, for those epsilon
- * transitions that produce a parse.
- */
-
-void yyinput_last_read_token( OBJECT * * name, int * line )
-{
- /* TODO: Consider whether and when we might want to report where the last
- * read token ended, e.g. EOF errors inside string literals.
- */
- *name = yylval.file;
- *line = yylval.line;
-}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/jam-files/engine/scan.h
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/jam-files/engine/scan.h b/ext/kenlm/jam-files/engine/scan.h
deleted file mode 100644
index 745477f..0000000
--- a/ext/kenlm/jam-files/engine/scan.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Copyright 1993, 1995 Christopher Seiwald.
- *
- * This file is part of Jam - see jam.c for Copyright information.
- */
-
-/*
- * scan.h - the jam yacc scanner
- *
- * External functions:
- * yyerror( char *s ) - print a parsing error message.
- * yyfparse( char *s ) - scan include file s.
- * yylex() - parse the next token, returning its type.
- * yymode() - adjust lexicon of scanner.
- * yyparse() - declaration for yacc parser.
- * yyanyerrors() - indicate if any parsing errors occured.
- *
- * The yymode() function is for the parser to adjust the lexicon of the scanner.
- * Aside from normal keyword scanning, there is a mode to handle action strings
- * (look only for the closing }) and a mode to ignore most keywords when looking
- * for a punctuation keyword. This allows non-punctuation keywords to be used in
- * lists without quoting.
- */
-
-#include "lists.h"
-#include "object.h"
-#include "parse.h"
-
-
-/*
- * YYSTYPE - value of a lexical token
- */
-
-#define YYSTYPE YYSYMBOL
-
-typedef struct _YYSTYPE
-{
- int type;
- OBJECT * string;
- PARSE * parse;
- LIST * list;
- int number;
- OBJECT * file;
- int line;
- char const * keyword;
-} YYSTYPE;
-
-extern YYSTYPE yylval;
-
-void yymode( int n );
-void yyerror( char const * s );
-int yyanyerrors();
-void yyfparse( OBJECT * s );
-int yyline();
-int yylex();
-int yyparse();
-void yyinput_last_read_token( OBJECT * * name, int * line );
-
-#define SCAN_NORMAL 0 /* normal parsing */
-#define SCAN_STRING 1 /* look only for matching } */
-#define SCAN_PUNCT 2 /* only punctuation keywords */