You are viewing a plain text version of this content. The canonical link for it is here.
Posted to oro-dev@jakarta.apache.org by df...@apache.org on 2002/01/25 10:32:29 UTC

cvs commit: jakarta-oro/src/java/org/apache/oro/text/regex Perl5Matcher.java

dfs         02/01/25 01:32:28

  Modified:    .        CHANGES
               src/java/org/apache/oro/text/regex Perl5Matcher.java
  Added:       src/java/examples/awk strings.java
  Log:
  Added strings.java example program and noted change.  Made minor formatting
  change in Perl5Matcher.
  
  Revision  Changes    Path
  1.22      +7 -1      jakarta-oro/CHANGES
  
  Index: CHANGES
  ===================================================================
  RCS file: /home/cvs/jakarta-oro/CHANGES,v
  retrieving revision 1.21
  retrieving revision 1.22
  diff -u -r1.21 -r1.22
  --- CHANGES	17 Jan 2002 19:30:28 -0000	1.21
  +++ CHANGES	25 Jan 2002 09:32:28 -0000	1.22
  @@ -1,4 +1,10 @@
  -$Id: CHANGES,v 1.21 2002/01/17 19:30:28 dfs Exp $
  +$Id: CHANGES,v 1.22 2002/01/25 09:32:28 dfs Exp $
  +
  +Version 2.0.x
  +
  +o Added a strings.java example to the awk examples, better demonstrating
  +  the character encoding issues associated with AwkMatcher's 8-bit
  +  character set limitation.
   
   Version 2.0.5
   
  
  
  
  1.1                  jakarta-oro/src/java/examples/awk/strings.java
  
  Index: strings.java
  ===================================================================
  /* ====================================================================
   * The Apache Software License, Version 1.1
   *
   * Copyright (c) 2002 The Apache Software Foundation.  All rights
   * reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
   *
   * 1. Redistributions of source code must retain the above copyright
   *    notice, this list of conditions and the following disclaimer.
   *
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in
   *    the documentation and/or other materials provided with the
   *    distribution.
   *
   * 3. The end-user documentation included with the redistribution,
   *    if any, must include the following acknowledgment:
   *       "This product includes software developed by the
   *        Apache Software Foundation (http://www.apache.org/)."
   *    Alternately, this acknowledgment may appear in the software itself,
   *    if and wherever such third-party acknowledgments normally appear.
   *
   * 4. The names "Apache" and "Apache Software Foundation", "Jakarta-Oro" 
   *    must not be used to endorse or promote products derived from this
   *    software without prior written permission. For written
   *    permission, please contact apache@apache.org.
   *
   * 5. Products derived from this software may not be called "Apache" 
   *    or "Jakarta-Oro", nor may "Apache" or "Jakarta-Oro" appear in their 
   *    name, without prior written permission of the Apache Software Foundation.
   *
   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   * SUCH DAMAGE.
   * ====================================================================
   *
   * This software consists of voluntary contributions made by many
   * individuals on behalf of the Apache Software Foundation.  For more
   * information on the Apache Software Foundation, please see
   * <http://www.apache.org/>.
   */
  
  /*
   * $Id: strings.java,v 1.1 2002/01/25 09:32:28 dfs Exp $
   */
  import java.io.*;
  import org.apache.oro.text.regex.*;
  import org.apache.oro.text.awk.*;
  
  /**
   * This is a test program demonstrating how to search an input stream
   * with the jakarta-oro awk package regular expression classes.  It
   * performs a function similar to the Unix <code>strings</code> command,
   * but is intended to show how matching on a stream is affected by its
   * character encoding.  The most important thing to remember is that
   * AwkMatcher only matches on 8-bit values.  If your input contains
   * Java characters containing values greater than 255, the pattern
   * matching process will result in an ArrayIndexOutOfBoundsException.
   * Therefore, if you want to search a binary file containing arbitrary
   * bytes, you have to make sure you use an 8-bit character encoding
   * like ISO-8859-1, so that the mapping between byte-values and character
   * values will be one to one.  Otherwise, the file may be decoded with
   * a multi-byte encoding (UTF-8, for example, or whatever the platform
   * default encoding happens to be), and you will probably wind up with
   * character values outside of the 8-bit range.
   *
   * @author <a href="mailto:oro-dev@jakarta.apache.org">Daniel F. Savarese</a>
   * @author <a href="mailto:oro-dev@jakarta.apache.org">David Lee Wilson</a>
   * @version @version@
   */
  public final class strings {

    /**
     * Searches a character stream for every occurrence of an awk regular
     * expression and prints each match on a line of its own.
     */
    public static final class StringFinder {
      /**
       * Default string expression.  Looks for at least 4 contiguous
       * printable characters.  Differs slightly from GNU strings command
       * in that any printable character may start a string.
       */
      public static final String DEFAULT_PATTERN =
        "[\\x20-\\x7E]{3}[\\x20-\\x7E]+";

      // Compiled form of the expression handed to the constructor.
      Pattern pattern;
      // Matcher instance reused by every call to search().
      AwkMatcher matcher;

      /**
       * Creates a finder for the given expression.  The expression is
       * compiled with <code>AwkCompiler.CASE_INSENSITIVE_MASK</code>.
       *
       * @param regex  The awk regular expression to search for.
       * @exception MalformedPatternException  If the expression cannot
       *            be compiled.
       */
      public StringFinder(String regex) throws MalformedPatternException {
        AwkCompiler compiler = new AwkCompiler();
        pattern = compiler.compile(regex, AwkCompiler.CASE_INSENSITIVE_MASK);
        matcher = new AwkMatcher();
      }

      /**
       * Creates a finder that searches for {@link #DEFAULT_PATTERN}.
       *
       * @exception MalformedPatternException  If the default expression
       *            cannot be compiled.
       */
      public StringFinder() throws MalformedPatternException {
        this(DEFAULT_PATTERN);
      }

      /**
       * Reads the input through an <code>AwkStreamInput</code> and writes
       * every match to the output with <code>println</code>, flushing the
       * output once no further match is found.  Neither the input nor
       * the output is closed; that remains the caller's responsibility.
       *
       * @param input  The character stream to search.
       * @param output  The destination for the matches.
       * @exception IOException  If reading the input fails.
       */
      public void search(Reader input, PrintWriter output) throws IOException {
        AwkStreamInput in = new AwkStreamInput(input);

        while(matcher.contains(in, pattern)) {
          MatchResult result = matcher.getMatch();
          output.println(result);
        }
        output.flush();
      }
    }


    /**
     * Encoding used when none is given on the command line.  It is an
     * 8-bit encoding, so every byte maps to exactly one character value.
     */
    public static final String DEFAULT_ENCODING = "ISO-8859-1";

    /**
     * Program entry point.  Usage:
     * <code>strings file [pattern|encoding] [encoding]</code>.
     * Prints every match found in the file to standard output.  Any
     * exception is reported with a stack trace on standard error.
     *
     * @param args  The command line arguments.
     */
    public static final void main(String args[]) {
      String regex = StringFinder.DEFAULT_PATTERN;
      String filename, encoding = DEFAULT_ENCODING;
      StringFinder finder;
      InputStream fin = null;
      Reader file = null;

      // Some users thought it would be useful to use the default pattern
      // and just pass the encoding as the second parameter.  Therefore,
      // when two arguments are given and the second argument is not a valid
      // encoding, it is interpreted as a pattern.  This means you can't
      // use a valid encoding name as a pattern without also specifying
      // an encoding as a third argument.
      if(args.length < 1) {
        System.err.println("usage: strings file [pattern|encoding] [encoding]");
        return;
      } else if(args.length > 2) {
        regex = args[1];
        encoding = args[2];
      } else if(args.length > 1) {
        encoding = args[1];
      }

      filename = args[0];

      try {
        fin = new FileInputStream(filename);

        try {
          file = new InputStreamReader(fin, encoding);
        } catch(UnsupportedEncodingException uee) {
          if(args.length == 2) {
            // The lone extra argument was not an encoding name, so treat
            // it as the pattern and fall back to the default encoding.
            regex    = encoding;
            encoding = DEFAULT_ENCODING;
            file     = new InputStreamReader(fin, encoding);
          } else {
            throw uee;
          }
        }

        finder = new StringFinder(regex);
        finder.search(file, new PrintWriter(new OutputStreamWriter(System.out)));
      } catch(Exception e) {
        e.printStackTrace();
      } finally {
        // Release the file on every path, including a malformed pattern or
        // an unsupported encoding.  Closing the reader also closes the
        // underlying stream; the stream is closed directly only when no
        // reader was ever created.
        try {
          if(file != null) {
            file.close();
          } else if(fin != null) {
            fin.close();
          }
        } catch(IOException ioe) {
          ioe.printStackTrace();
        }
      }
    }
  }
  
  
  
  1.18      +5 -5      jakarta-oro/src/java/org/apache/oro/text/regex/Perl5Matcher.java
  
  Index: Perl5Matcher.java
  ===================================================================
  RCS file: /home/cvs/jakarta-oro/src/java/org/apache/oro/text/regex/Perl5Matcher.java,v
  retrieving revision 1.17
  retrieving revision 1.18
  diff -u -r1.17 -r1.18
  --- Perl5Matcher.java	14 Nov 2001 02:39:37 -0000	1.17
  +++ Perl5Matcher.java	25 Jan 2002 09:32:28 -0000	1.18
  @@ -59,7 +59,7 @@
   
   
   /*
  - * $Id: Perl5Matcher.java,v 1.17 2001/11/14 02:39:37 dfs Exp $
  + * $Id: Perl5Matcher.java,v 1.18 2002/01/25 09:32:28 dfs Exp $
    */
   import java.io.IOException;
   import java.util.*;
  @@ -261,8 +261,8 @@
         offs = __beginMatchOffsets[__numParentheses];
   
         if(offs >= 0)
  -	__lastMatchResult._beginGroupOffset[__numParentheses]
  -	  = offs - __lastMatchResult._matchBeginOffset;
  +	__lastMatchResult._beginGroupOffset[__numParentheses] =
  +	  offs - __lastMatchResult._matchBeginOffset;
         else
   	__lastMatchResult._beginGroupOffset[__numParentheses] =
   	  OpCode._NULL_OFFSET;
  @@ -270,8 +270,8 @@
         offs = __endMatchOffsets[__numParentheses];
   
         if(offs >= 0)
  -	__lastMatchResult._endGroupOffset[__numParentheses]
  -	  = offs - __lastMatchResult._matchBeginOffset;
  +	__lastMatchResult._endGroupOffset[__numParentheses] =
  +	  offs - __lastMatchResult._matchBeginOffset;
         else
   	__lastMatchResult._endGroupOffset[__numParentheses] =
   	  OpCode._NULL_OFFSET;
  
  
  

--
To unsubscribe, e-mail:   <ma...@jakarta.apache.org>
For additional commands, e-mail: <ma...@jakarta.apache.org>