You are viewing a plain text version of this content. The canonical link for it is here.
Posted to oro-dev@jakarta.apache.org by df...@apache.org on 2002/01/25 10:32:29 UTC
cvs commit: jakarta-oro/src/java/org/apache/oro/text/regex Perl5Matcher.java
dfs 02/01/25 01:32:28
Modified: . CHANGES
src/java/org/apache/oro/text/regex Perl5Matcher.java
Added: src/java/examples/awk strings.java
Log:
Added strings.java example program and noted change. Made minor formatting
change in Perl5Matcher.
Revision Changes Path
1.22 +7 -1 jakarta-oro/CHANGES
Index: CHANGES
===================================================================
RCS file: /home/cvs/jakarta-oro/CHANGES,v
retrieving revision 1.21
retrieving revision 1.22
diff -u -r1.21 -r1.22
--- CHANGES 17 Jan 2002 19:30:28 -0000 1.21
+++ CHANGES 25 Jan 2002 09:32:28 -0000 1.22
@@ -1,4 +1,10 @@
-$Id: CHANGES,v 1.21 2002/01/17 19:30:28 dfs Exp $
+$Id: CHANGES,v 1.22 2002/01/25 09:32:28 dfs Exp $
+
+Version 2.0.x
+
+o Added a strings.java example to the awk examples, better demonstrating
+ the character encoding issues associated with AwkMatcher's 8-bit
+ character set limitation.
Version 2.0.5
1.1 jakarta-oro/src/java/examples/awk/strings.java
Index: strings.java
===================================================================
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2002 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation", "Jakarta-Oro"
* must not be used to endorse or promote products derived from this
* software without prior written permission. For written
* permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache"
* or "Jakarta-Oro", nor may "Apache" or "Jakarta-Oro" appear in their
* name, without prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
/*
* $Id: strings.java,v 1.1 2002/01/25 09:32:28 dfs Exp $
*/
import java.io.*;
import org.apache.oro.text.regex.*;
import org.apache.oro.text.awk.*;
/**
* This is a test program demonstrating how to search an input stream
* with the jakarta-oro awk package regular expression classes. It
* performs a function similar to the Unix <code>strings</code> command,
* but is intended to show how matching on a stream is affected by its
* character encoding. The most important thing to remember is that
* AwkMatcher only matches on 8-bit values. If your input contains
* Java characters containing values greater than 255, the pattern
* matching process will result in an ArrayIndexOutOfBoundsException.
* Therefore, if you want to search a binary file containing arbitrary
* bytes, you have to make sure you use an 8-bit character encoding
* like ISO-8859-1, so that the mapping between byte-values and character
* values will be one to one. Otherwise, a reader created without an
* explicit encoding decodes the file using the platform default encoding
* (often a multi-byte encoding such as UTF-8), and you will probably wind
* up with character values outside of the 8-bit range.
*
* @author <a href="mailto:oro-dev@jakarta.apache.org">Daniel F. Savarese</a>
* @author <a href="mailto:oro-dev@jakarta.apache.org">David Lee Wilson</a>
* @version @version@
*/
public final class strings {

  /**
   * Searches a character stream for every occurrence of a regular
   * expression using the awk package matcher and prints each match.
   */
  public static final class StringFinder {
    /**
     * Default string expression.  Looks for at least 4 contiguous
     * printable characters.  Differs slightly from GNU strings command
     * in that any printable character may start a string.
     */
    public static final String DEFAULT_PATTERN =
      "[\\x20-\\x7E]{3}[\\x20-\\x7E]+";

    /** The compiled expression every call to search() matches against. */
    Pattern pattern;

    /** The matcher used to scan the input stream. */
    AwkMatcher matcher;

    /**
     * Creates a finder for the given expression.  The expression is
     * compiled with AwkCompiler.CASE_INSENSITIVE_MASK.
     *
     * @param regex  The awk regular expression to search for.
     * @exception MalformedPatternException  If the expression cannot be
     *            compiled.
     */
    public StringFinder(String regex) throws MalformedPatternException {
      AwkCompiler compiler = new AwkCompiler();
      pattern = compiler.compile(regex, AwkCompiler.CASE_INSENSITIVE_MASK);
      matcher = new AwkMatcher();
    }

    /**
     * Creates a finder that searches for DEFAULT_PATTERN.
     *
     * @exception MalformedPatternException  If the default expression
     *            cannot be compiled.
     */
    public StringFinder() throws MalformedPatternException {
      this(DEFAULT_PATTERN);
    }

    /**
     * Prints every match found in the input, one per line, to the output.
     * The output is always flushed before returning, even when matching
     * fails part way through, so matches found before a failure are not
     * lost in the writer's buffer.  Neither the input nor the output is
     * closed.
     *
     * @param input   The character stream to search.
     * @param output  The writer each match is printed to.
     * @exception IOException  If an error occurs reading the input.
     */
    public void search(Reader input, PrintWriter output) throws IOException {
      AwkStreamInput in = new AwkStreamInput(input);

      try {
        while(matcher.contains(in, pattern)) {
          MatchResult result = matcher.getMatch();
          output.println(result);
        }
      } finally {
        output.flush();
      }
    }
  }

  /** The encoding used to read the file when none is specified. */
  public static final String DEFAULT_ENCODING = "ISO-8859-1";

  /**
   * Program entry point.  Prints to standard output every string in a
   * file that matches a pattern.
   *
   * @param args  <code>file [pattern|encoding] [encoding]</code>.  The
   *              pattern defaults to StringFinder.DEFAULT_PATTERN and the
   *              encoding to DEFAULT_ENCODING.
   */
  public static final void main(String args[]) {
    String regex = StringFinder.DEFAULT_PATTERN;
    String filename, encoding = DEFAULT_ENCODING;
    StringFinder finder;
    InputStream fin = null;
    Reader file = null;

    // Some users thought it would be useful to use the default pattern
    // and just pass the encoding as the second parameter.  Therefore,
    // when two arguments are given and the second argument is not a valid
    // encoding, it is interpreted as a pattern.  This means you can't
    // use a valid encoding name as a pattern without also specifying
    // an encoding as a third argument.
    if(args.length < 1) {
      System.err.println("usage: strings file [pattern|encoding] [encoding]");
      return;
    } else if(args.length > 2) {
      regex    = args[1];
      encoding = args[2];
    } else if(args.length > 1) {
      encoding = args[1];
    }

    filename = args[0];

    try {
      fin = new FileInputStream(filename);

      try {
        file = new InputStreamReader(fin, encoding);
      } catch(UnsupportedEncodingException uee) {
        // Only a lone second argument may fall back to being a pattern.
        if(args.length != 2) {
          throw uee;
        }
        regex    = encoding;
        encoding = DEFAULT_ENCODING;
        file     = new InputStreamReader(fin, encoding);
      }

      finder = new StringFinder(regex);
      finder.search(file, new PrintWriter(new OutputStreamWriter(System.out)));
    } catch(Exception e) {
      e.printStackTrace();
    } finally {
      // Release the file on every path.  Closing the reader also closes
      // the stream it wraps; the raw stream is closed directly only when
      // no reader could be created around it.
      try {
        if(file != null) {
          file.close();
        } else if(fin != null) {
          fin.close();
        }
      } catch(IOException ioe) {
        ioe.printStackTrace();
      }
    }
  }
}
1.18 +5 -5 jakarta-oro/src/java/org/apache/oro/text/regex/Perl5Matcher.java
Index: Perl5Matcher.java
===================================================================
RCS file: /home/cvs/jakarta-oro/src/java/org/apache/oro/text/regex/Perl5Matcher.java,v
retrieving revision 1.17
retrieving revision 1.18
diff -u -r1.17 -r1.18
--- Perl5Matcher.java 14 Nov 2001 02:39:37 -0000 1.17
+++ Perl5Matcher.java 25 Jan 2002 09:32:28 -0000 1.18
@@ -59,7 +59,7 @@
/*
- * $Id: Perl5Matcher.java,v 1.17 2001/11/14 02:39:37 dfs Exp $
+ * $Id: Perl5Matcher.java,v 1.18 2002/01/25 09:32:28 dfs Exp $
*/
import java.io.IOException;
import java.util.*;
@@ -261,8 +261,8 @@
offs = __beginMatchOffsets[__numParentheses];
if(offs >= 0)
- __lastMatchResult._beginGroupOffset[__numParentheses]
- = offs - __lastMatchResult._matchBeginOffset;
+ __lastMatchResult._beginGroupOffset[__numParentheses] =
+ offs - __lastMatchResult._matchBeginOffset;
else
__lastMatchResult._beginGroupOffset[__numParentheses] =
OpCode._NULL_OFFSET;
@@ -270,8 +270,8 @@
offs = __endMatchOffsets[__numParentheses];
if(offs >= 0)
- __lastMatchResult._endGroupOffset[__numParentheses]
- = offs - __lastMatchResult._matchBeginOffset;
+ __lastMatchResult._endGroupOffset[__numParentheses] =
+ offs - __lastMatchResult._matchBeginOffset;
else
__lastMatchResult._endGroupOffset[__numParentheses] =
OpCode._NULL_OFFSET;
--
To unsubscribe, e-mail: <ma...@jakarta.apache.org>
For additional commands, e-mail: <ma...@jakarta.apache.org>