You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by tm...@apache.org on 2013/04/23 15:27:49 UTC

svn commit: r1470944 - /ctakes/trunk/ctakes-dependency-parser/src/main/java/org/apache/ctakes/dependency/parser/util/DependencyRegex.java

Author: tmill
Date: Tue Apr 23 13:27:48 2013
New Revision: 1470944

URL: http://svn.apache.org/r1470944
Log:
Added DependencyRegex class back in.

Added:
    ctakes/trunk/ctakes-dependency-parser/src/main/java/org/apache/ctakes/dependency/parser/util/DependencyRegex.java

Added: ctakes/trunk/ctakes-dependency-parser/src/main/java/org/apache/ctakes/dependency/parser/util/DependencyRegex.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dependency-parser/src/main/java/org/apache/ctakes/dependency/parser/util/DependencyRegex.java?rev=1470944&view=auto
==============================================================================
--- ctakes/trunk/ctakes-dependency-parser/src/main/java/org/apache/ctakes/dependency/parser/util/DependencyRegex.java (added)
+++ ctakes/trunk/ctakes-dependency-parser/src/main/java/org/apache/ctakes/dependency/parser/util/DependencyRegex.java Tue Apr 23 13:27:48 2013
@@ -0,0 +1,322 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.dependency.parser.util;
+
+import java.util.Iterator;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode;
+
+import com.googlecode.clearnlp.dependency.DEPNode;
+
+/*import clear.dep.DepLib;
+import clear.dep.DepNode;
+import clear.ftr.FtrLib;
+*/
+
+/**
+ * @author m081914
+ *
+ */
+public class DependencyRegex {
+
+	/** Static identifiers used to build Regex expressions.
+	 * ANY_TOKEN and ANY_DEPREL are negated character classes built from the raw
+	 * (unescaped) right-hand token / relation delimiters in Delim, repeated zero
+	 * or more times. ANY_POS is zero or more word characters. ANY_NOUN, ANY_VERB
+	 * and ANY_ADJECTIVE match the letter N, V or J followed by one or two
+	 * arbitrary characters (presumably Penn Treebank-style tags - confirm).
+	 * The fields are public and non-final, so callers are able to reassign them.
+	 */
+	public static String ANY_TOKEN  = "[^"+Delim.R_TOK_DELIM+"]*"; 
+	public static String ANY_DEPREL = "[^"+Delim.R_REL_DELIM+"]*";
+	public static String ANY_POS    = "\\w*";
+	public static String ANY_NOUN   = "N..?";
+	public static String ANY_VERB   = "V..?";
+	public static String ANY_ADJECTIVE = "J..?";
+	/**
+	 * Builds a regex alternation from a set of regex fragments: each element is
+	 * wrapped in a non-capturing group, the groups are joined with '|', and the
+	 * whole alternation is wrapped in one capturing group, e.g. {a, b} becomes
+	 * "((?:a)|(?:b))". Elements are inserted as-is (not escaped), in the set's
+	 * iteration order.
+	 *
+	 * @param okwords the alternatives to accept; must not be null
+	 * @return the alternation, or the empty string when the set is empty
+	 */
+	public static String fromSet ( Set<String> okwords ) {
+		Iterator<String> it = okwords.iterator();
+		if (!it.hasNext()) {
+			// Nothing to alternate over. Decide this from the iterator rather than
+			// by comparing string references, which could leave a dangling ")".
+			return "";
+		}
+		StringBuilder str = new StringBuilder("(");
+		str.append("(?:").append(it.next()).append(")");
+		while (it.hasNext()) {
+			str.append("|(?:").append(it.next()).append(")");
+		}
+		return str.append(")").toString();
+	}
+
+	private static String L_TOK_DELIM = metaReplace(Delim.L_TOK_DELIM); 
+	private static String R_TOK_DELIM = metaReplace(Delim.R_TOK_DELIM); 
+	private static String L_POS_DELIM = metaReplace(Delim.L_POS_DELIM); 
+	private static String R_POS_DELIM = metaReplace(Delim.R_POS_DELIM); 
+	private static String L_REL_DELIM = metaReplace(Delim.L_REL_DELIM); 
+	private static String R_REL_DELIM = metaReplace(Delim.R_REL_DELIM); 
+	private static String UP_ARC_A    = metaReplace(Delim.UP_ARC_A); 
+	private static String UP_ARC_B    = metaReplace(Delim.UP_ARC_B); 
+	private static String DN_ARC_A    = metaReplace(Delim.DN_ARC_A); 
+	private static String DN_ARC_B    = metaReplace(Delim.DN_ARC_B); 
+
+	private Pattern regex;
+		
+
+	public DependencyRegex() {
+		/* Intentionally empty: the regex field is left null, so get(), toString(),
+		 * matcher(), matches(), find() and split() throw NullPointerException until
+		 * a pattern is assigned (the combinator methods in this class assign one). */
+	}
+
+	/**
+	 * Creates a case-insensitive regex from the string form of a dependency path.
+	 *
+	 * @param path a DependencyPath whose toString() output is compiled as the pattern
+	 */
+	public DependencyRegex( DependencyPath path ) {
+		this.regex = Pattern.compile(path.toString(), Pattern.CASE_INSENSITIVE);
+	}
+
+	/**
+	 * @param str a string containing the Java-format regular expression to set
+	 */
+	public DependencyRegex( String str ) {
+		this.regex = compile(str,Pattern.CASE_INSENSITIVE);
+	}
+
+	
+	/**
+	 * Builds a case-insensitive regex that describes a path through the given nodes.
+	 * <p>
+	 * Nodes before the common node are written as token, POS, then an "up" arc
+	 * carrying the dependency label. The common node is written as token and POS
+	 * only. Nodes after it are written as a "down" arc carrying the label, then
+	 * token and POS. A null form, POS or label is treated as a wildcard
+	 * (ANY_TOKEN, ANY_POS, ANY_DEPREL). Node values are inserted into the regex
+	 * unescaped, and the nodes themselves are left unmodified.
+	 *
+	 * @param regnodes the nodes along the path, in path order
+	 * @param commonNodeIndex The index (starting from 1) of the lowest common node in the dependency tree
+	 */
+	public DependencyRegex(DEPNode[] regnodes, int commonNodeIndex) {
+		StringBuilder str = new StringBuilder();
+		int commonPos = commonNodeIndex - 1;
+
+		for (int i=0; i<regnodes.length; i++) {
+			// Resolve wildcards into locals: writing them back into the node would
+			// leave regex fragments in the caller's dependency tree.
+			String form  = regnodes[i].form!=null ? regnodes[i].form : ANY_TOKEN;
+			String pos   = regnodes[i].pos!=null  ? regnodes[i].pos  : ANY_POS;
+			String label = regnodes[i].getLabel();
+			if (label==null) {
+				label = ANY_DEPREL;
+			}
+
+			String tokenAndPos = L_TOK_DELIM + form + R_TOK_DELIM
+					+ L_POS_DELIM + pos + R_POS_DELIM;
+			String relation = L_REL_DELIM + label + R_REL_DELIM;
+
+			if (i==commonPos) {
+				// The common node carries no arc of its own.
+				str.append( tokenAndPos );
+			} else if (i>commonPos) {
+				str.append( DN_ARC_A + relation + DN_ARC_B + tokenAndPos );
+			} else {
+				str.append( tokenAndPos + UP_ARC_B + relation + UP_ARC_A );
+			}
+		}
+
+		this.regex = compile(str.toString(),Pattern.CASE_INSENSITIVE);
+	}
+	
+	/**
+	 * Builds a case-insensitive regex that describes a path through the given nodes.
+	 * <p>
+	 * Nodes before the common node are written as token, POS tag, then an "up" arc
+	 * carrying the dependency relation. The common node is written as token and
+	 * POS tag only. Nodes after it are written as a "down" arc carrying the
+	 * relation, then token and POS tag. A null form, POS tag or relation is
+	 * treated as a wildcard (ANY_TOKEN, ANY_POS, ANY_DEPREL). Node values are
+	 * inserted into the regex unescaped, and the nodes themselves are left
+	 * unmodified.
+	 *
+	 * @param regnodes the nodes along the path, in path order
+	 * @param commonNodeIndex The index (starting from 1) of the lowest common node in the dependency tree
+	 */
+	public DependencyRegex(ConllDependencyNode[] regnodes, int commonNodeIndex) {
+		StringBuilder str = new StringBuilder();
+		int commonPos = commonNodeIndex - 1;
+
+		for (int i=0; i<regnodes.length; i++) {
+			// Resolve wildcards into locals: calling the setters would write regex
+			// fragments into the caller's annotations.
+			String form   = regnodes[i].getForm();
+			String postag = regnodes[i].getPostag();
+			String deprel = regnodes[i].getDeprel();
+			if (form==null)   form   = ANY_TOKEN;
+			if (postag==null) postag = ANY_POS;
+			if (deprel==null) deprel = ANY_DEPREL;
+
+			String tokenAndPos = L_TOK_DELIM + form + R_TOK_DELIM
+					+ L_POS_DELIM + postag + R_POS_DELIM;
+			String relation = L_REL_DELIM + deprel + R_REL_DELIM;
+
+			if (i==commonPos) {
+				// The common node carries no arc of its own.
+				str.append( tokenAndPos );
+			} else if (i>commonPos) {
+				str.append( DN_ARC_A + relation + DN_ARC_B + tokenAndPos );
+			} else {
+				str.append( tokenAndPos + UP_ARC_B + relation + UP_ARC_A );
+			}
+		}
+
+		this.regex = compile(str.toString(),Pattern.CASE_INSENSITIVE);
+	}
+	private static String metaReplace( String str ) {
+		str = Pattern.compile("\\{").matcher(str).replaceAll("\\\\{");
+		str = Pattern.compile("\\}").matcher(str).replaceAll("\\\\}");
+		str = Pattern.compile("\\[").matcher(str).replaceAll("\\\\[");
+		str = Pattern.compile("\\]").matcher(str).replaceAll("\\\\]");
+		str = Pattern.compile("\\+").matcher(str).replaceAll("\\\\+");
+		str = Pattern.compile("\\*").matcher(str).replaceAll("\\\\*");
+		str = Pattern.compile("\\(").matcher(str).replaceAll("\\\\(");
+		str = Pattern.compile("\\)").matcher(str).replaceAll("\\\\)");
+		str = Pattern.compile("\\^").matcher(str).replaceAll("\\\\^");
+		str = Pattern.compile("\\$").matcher(str).replaceAll("\\\\$");
+		str = Pattern.compile("\\.").matcher(str).replaceAll("\\\\.");
+		return str;
+	}
+
+	/**
+	 * Compiles a pattern string with case-insensitive matching.
+	 *
+	 * @param str a Java-format regular expression
+	 * @return the compiled, case-insensitive pattern
+	 */
+	public Pattern compile(String str) {
+		final Pattern caseInsensitive = Pattern.compile(str, Pattern.CASE_INSENSITIVE);
+		return caseInsensitive;
+	}
+
+	public Pattern compile(String str, int flag) {
+		return Pattern.compile(str, flag);
+	}
+
+	/**
+	 * Returns the source text of the regular expression held by this object.
+	 *
+	 * @return the regex string the pattern was compiled from
+	 */
+	public String get() {
+		final Pattern current = regex;
+		return current.pattern();
+	}
+
+
+	/**
+	 * Returns the regular expression as a string. The combinator methods of this
+	 * class concatenate this value to build larger expressions.
+	 *
+	 * @return the regex string the pattern was compiled from
+	 */
+	@Override
+	public String toString() {
+		return regex.toString();
+	}
+
+	/**
+	 * Creates a matcher of this regex against the given input.
+	 *
+	 * @param input the character sequence to match
+	 * @return a new Matcher for the input
+	 */
+	public Matcher matcher( CharSequence input ) {
+		final Matcher result = regex.matcher( input );
+		return result;
+	}
+	
+	/**
+	 * Tests whether the whole input string matches this regex.
+	 * <p>
+	 * No output is produced: like find(), this is a pure query and does not
+	 * print matched text to standard output.
+	 *
+	 * @param str the string to test
+	 * @return true if the entire string matches the pattern
+	 */
+	public boolean matches( String str ) {
+		return regex.matcher( str ).matches();
+	}
+
+	/**
+	 * Tests whether this regex matches anywhere inside the given string.
+	 *
+	 * @param str the string to search
+	 * @return true if some subsequence of the string matches the pattern
+	 */
+	public boolean find( String str ) {
+		return regex.matcher( str ).find();
+	}
+
+	/**
+	 * Splits the input around matches of this regex.
+	 *
+	 * @param input the character sequence to split
+	 * @return the pieces of the input lying between matches
+	 */
+	public String[] split(CharSequence input) {
+		final String[] pieces = regex.split(input);
+		return pieces;
+	}
+
+//	// Add to DependencyRegexes... especially optional ones
+//	public void append( DependencyRegex dregex ) {
+//		this.regex = compile( this.toString() + dregex.toString(),
+//				Pattern.CASE_INSENSITIVE);
+//	}
+//	
+//	public void appendOptional( DependencyRegex dregex ) {
+//		this.regex = compile( this.toString() + "("+dregex.toString()+")*",
+//				Pattern.CASE_INSENSITIVE);
+//	}
+//
+//	public void prepend( DependencyRegex dregex ) {
+//		this.regex = compile( dregex.toString() + this.toString(),
+//				Pattern.CASE_INSENSITIVE);
+//	}
+//	
+//	public void prependOptional( DependencyRegex dregex ) {
+//		this.regex = compile( "("+dregex.toString()+")*?" + this.toString(),
+//				Pattern.CASE_INSENSITIVE);
+//	}
+
+	// Combine DependencyRegexes 
+	/**
+	 * Returns a new regex consisting of this regex followed by the given one.
+	 * Neither operand is modified.
+	 *
+	 * @param dregex the regex to place after this one
+	 * @return the concatenated, case-insensitive regex
+	 */
+	public DependencyRegex append( DependencyRegex dregex ) {
+		String combined = this.toString() + dregex.toString();
+		DependencyRegex result = new DependencyRegex();
+		result.regex = compile( combined, Pattern.CASE_INSENSITIVE );
+		return result;
+	}
+	
+	/**
+	 * Returns a new regex consisting of this regex followed by zero or more
+	 * (greedy) repetitions of the given one, wrapped in a capturing group.
+	 * Neither operand is modified.
+	 *
+	 * @param dregex the regex to place, optionally and repeatably, after this one
+	 * @return the combined, case-insensitive regex
+	 */
+	public DependencyRegex appendOptional( DependencyRegex dregex ) {
+		StringBuilder combined = new StringBuilder( this.toString() );
+		combined.append("(").append( dregex.toString() ).append(")*");
+		DependencyRegex result = new DependencyRegex();
+		result.regex = compile( combined.toString(), Pattern.CASE_INSENSITIVE );
+		return result;
+	}
+
+	/**
+	 * Returns a new regex consisting of the given regex followed by this one.
+	 * Neither operand is modified.
+	 *
+	 * @param dregex the regex to place before this one
+	 * @return the concatenated, case-insensitive regex
+	 */
+	public DependencyRegex prepend( DependencyRegex dregex ) {
+		String combined = dregex.toString() + this.toString();
+		DependencyRegex result = new DependencyRegex();
+		result.regex = compile( combined, Pattern.CASE_INSENSITIVE );
+		return result;
+	}
+	
+	/**
+	 * Returns a new regex consisting of zero or more (reluctant) repetitions of
+	 * the given regex, wrapped in a capturing group, followed by this one.
+	 * Neither operand is modified.
+	 *
+	 * @param dregex the regex to place, optionally and repeatably, before this one
+	 * @return the combined, case-insensitive regex
+	 */
+	public DependencyRegex prependOptional( DependencyRegex dregex ) {
+		StringBuilder combined = new StringBuilder("(");
+		combined.append( dregex.toString() ).append(")*?").append( this.toString() );
+		DependencyRegex result = new DependencyRegex();
+		result.regex = compile( combined.toString(), Pattern.CASE_INSENSITIVE );
+		return result;
+	}
+
+	// Add dnodes to DependencyRegex
+	/**
+	 * Appends a regex built from the given nodes to this regex. The nodes are
+	 * compiled with a common-node index of -1, so no array position is treated
+	 * as the common node and every node takes the "down" arc form.
+	 *
+	 * @param dnodes the nodes to append
+	 * @return a new regex: this one followed by the nodes' regex
+	 */
+	public DependencyRegex append( DEPNode[] dnodes ) {
+		return append( new DependencyRegex(dnodes,-1) );
+	}
+	
+	/**
+	 * Appends, as an optional repeatable group, a regex built from the given
+	 * nodes. The nodes are compiled with a common-node index of -1, so every
+	 * node takes the "down" arc form.
+	 *
+	 * @param dnodes the nodes to append optionally
+	 * @return a new regex: this one followed by the optional nodes' regex
+	 */
+	public DependencyRegex appendOptional( DEPNode[] dnodes ) {
+		return appendOptional( new DependencyRegex(dnodes,-1) );
+	}
+
+	/**
+	 * Prepends a regex built from the given nodes to this regex. The nodes are
+	 * compiled with a common-node index of 1024, so nodes at array positions
+	 * below 1023 take the "up" arc form.
+	 *
+	 * @param dnodes the nodes to prepend
+	 * @return a new regex: the nodes' regex followed by this one
+	 */
+	public DependencyRegex prepend( DEPNode[] dnodes ) {
+		return prepend( new DependencyRegex(dnodes,1024) );
+	}
+	
+	/**
+	 * Prepends, as an optional repeatable group, a regex built from the given
+	 * nodes. The nodes are compiled with a common-node index of 1024, so nodes
+	 * at array positions below 1023 take the "up" arc form.
+	 *
+	 * @param dnodes the nodes to prepend optionally
+	 * @return a new regex: the optional nodes' regex followed by this one
+	 */
+	public DependencyRegex prependOptional( DEPNode[] dnodes ) {
+		return prependOptional( new DependencyRegex(dnodes,1024) );
+	}
+
+	// Add ConllDependencyNodes to DependencyRegex
+	/**
+	 * Appends a regex built from the given nodes to this regex. The nodes are
+	 * compiled with a common-node index of -1, so no array position is treated
+	 * as the common node and every node takes the "down" arc form.
+	 *
+	 * @param dnodes the nodes to append
+	 * @return a new regex: this one followed by the nodes' regex
+	 */
+	public DependencyRegex append( ConllDependencyNode[] dnodes ) {
+		return append( new DependencyRegex(dnodes,-1) );
+	}
+	
+	/**
+	 * Appends, as an optional repeatable group, a regex built from the given
+	 * nodes. The nodes are compiled with a common-node index of -1, so every
+	 * node takes the "down" arc form.
+	 *
+	 * @param dnodes the nodes to append optionally
+	 * @return a new regex: this one followed by the optional nodes' regex
+	 */
+	public DependencyRegex appendOptional( ConllDependencyNode[] dnodes ) {
+		return appendOptional( new DependencyRegex(dnodes,-1) );
+	}
+
+	/**
+	 * Prepends a regex built from the given nodes to this regex. The nodes are
+	 * compiled with a common-node index of 1024, so nodes at array positions
+	 * below 1023 take the "up" arc form.
+	 *
+	 * @param dnodes the nodes to prepend
+	 * @return a new regex: the nodes' regex followed by this one
+	 */
+	public DependencyRegex prepend( ConllDependencyNode[] dnodes ) {
+		return prepend( new DependencyRegex(dnodes,1024) );
+	}
+	
+	/**
+	 * Prepends, as an optional repeatable group, a regex built from the given
+	 * nodes. The nodes are compiled with a common-node index of 1024, so nodes
+	 * at array positions below 1023 take the "up" arc form.
+	 *
+	 * @param dnodes the nodes to prepend optionally
+	 * @return a new regex: the optional nodes' regex followed by this one
+	 */
+	public DependencyRegex prependOptional( ConllDependencyNode[] dnodes ) {
+		return prependOptional( new DependencyRegex(dnodes,1024) );
+	}
+}