You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by pa...@apache.org on 2022/01/07 04:16:45 UTC

svn commit: r1896777 - /ctakes/trunk/ctakes-ytex-uima/src/main/java/org/apache/ctakes/ytex/uima/annotators/NegexAnnotator.java

Author: pabramowitsch
Date: Fri Jan  7 04:16:44 2022
New Revision: 1896777

URL: http://svn.apache.org/viewvc?rev=1896777&view=rev
Log:
Better negation coverage for compound sentences, performance.   
Note:  needs new file in res:  negex_excluded_keys.txt

Modified:
    ctakes/trunk/ctakes-ytex-uima/src/main/java/org/apache/ctakes/ytex/uima/annotators/NegexAnnotator.java

Modified: ctakes/trunk/ctakes-ytex-uima/src/main/java/org/apache/ctakes/ytex/uima/annotators/NegexAnnotator.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-ytex-uima/src/main/java/org/apache/ctakes/ytex/uima/annotators/NegexAnnotator.java?rev=1896777&r1=1896776&r2=1896777&view=diff
==============================================================================
--- ctakes/trunk/ctakes-ytex-uima/src/main/java/org/apache/ctakes/ytex/uima/annotators/NegexAnnotator.java (original)
+++ ctakes/trunk/ctakes-ytex-uima/src/main/java/org/apache/ctakes/ytex/uima/annotators/NegexAnnotator.java Fri Jan  7 04:16:44 2022
@@ -20,6 +20,7 @@ package org.apache.ctakes.ytex.uima.anno
 
 import java.io.BufferedReader;
 import java.io.IOException;
+import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.lang.reflect.InvocationTargetException;
 import java.nio.CharBuffer;
@@ -27,9 +28,11 @@ import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.Comparator;
+import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Iterator;
 import java.util.List;
+import java.util.Map.Entry;
 import java.util.Set;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
@@ -51,9 +54,11 @@ import org.apache.uima.resource.Resource
 
 
 /**
- * Negex adapted to cTAKES. Checks negation status of named entities. Loads
- * negex triggers from classpath:
+ * Negex adapted to cTAKES. Checks negation status of named entities. 
+ * Loads negex triggers from classpath:
  * <tt>/org/apache/ctakes/ytex/uima/annotators/negex_triggers.txt</tt>
+ * Loads negex ignore words from classpath:
+ * <tt>/org/apache/ctakes/ytex/uima/annotators/negex_excluded_keys.txt</tt>
  * <p/>
  * The meaning of the certainty and confidence attributes is nowhere documented
  * for cTakes. There are several ways of handling 'maybes', see below. Default
@@ -84,7 +89,7 @@ import org.apache.uima.resource.Resource
  * an annotation type. Will see if it is negated; if so will set the negated and
  * possible boolean values on the annotation.
  * 
- * @author vijay
+ * @author vijay, heavily updated by Peter A.
  * 
  */
 @PipeBitInfo(
@@ -93,18 +98,37 @@ import org.apache.uima.resource.Resource
 		dependencies = { PipeBitInfo.TypeProduct.SENTENCE, PipeBitInfo.TypeProduct.IDENTIFIED_ANNOTATION }
 )
 public class NegexAnnotator extends JCasAnnotator_ImplBase {
+	private static final String NEGEX_EXCLUDED_KEYS = "/org/apache/ctakes/ytex/uima/annotators/negex_excluded_keys.txt";
+	private static final String NEGEX_TRIGGERS = "/org/apache/ctakes/ytex/uima/annotators/negex_triggers.txt";
 	private static final Log log = LogFactory.getLog(NegexAnnotator.class);
-	private List<NegexRule> listNegexRules = null;
 	private boolean negatePossibilities = true;
 	private boolean checkPossibilities = true;
 	private boolean storeAsInterval = false;
 	private String targetTypeName = null;
+	// only look for rules matching around the NE
+	// by this window.  Long unpunctuated notes
+	// may display tens of spurious matches which 
+	// are thrown away for each NE.
+	private final int MATCHER_WINDOW = 200;
+	
+	private final int STOP_INIT = Integer.MAX_VALUE;
+
+	private HashMap<String,ArrayList<NegexRule>> wordCloud = new HashMap<String,ArrayList<NegexRule>>(300);
+	
+	// throwaway words used in negation expressions that would result in performing extra matches
+	private List<String> excludedKeyWords = null;
+	private int ruleCount;
+	
+	private final static String[] tagList = { "[CONJ]", "[PSEU]", "[PREN]", "[POST]", "[POSP]" };
+
 
 	@Override
 	public void initialize(UimaContext aContext)
 			throws ResourceInitializationException {
 		super.initialize(aContext);
-		this.listNegexRules = this.initializeRules();
+		this.excludedKeyWords = this.initalizeExcludedKeyWords();
+		this.initializeRules();
+		
 		negatePossibilities = getBooleanConfigParam(aContext,
 				"negatePossibilities", negatePossibilities);
 		if (negatePossibilities) {
@@ -125,95 +149,156 @@ public class NegexAnnotator extends JCas
 		return paramValue == null ? defaultVal : paramValue;
 
 	}
-
-	private List<String> initalizeRuleList() {
-		List<String> rules = new ArrayList<String>();
+	
+	private List<String> listReader(String path) throws ResourceInitializationException {
+		List<String> list = new ArrayList<String>();
 		BufferedReader reader = null;
 		try {
-			reader = new BufferedReader(new InputStreamReader(this.getClass()
-					.getResourceAsStream(
-							"/org/apache/ctakes/ytex/uima/annotators/negex_triggers.txt")));
+			InputStream stream = this.getClass().getResourceAsStream(path);
+			if (stream == null) {
+				log.error("Unable to find resource: " + path);
+				throw new ResourceInitializationException(path, null);
+			}
+			reader = new BufferedReader(new InputStreamReader(stream));
 			String line = null;
 			try {
 				while ((line = reader.readLine()) != null)
-					rules.add(line);
+					if (line.charAt(0) != '#') {
+						list.add(line);
+					}
 			} catch (IOException e) {
-				log.error("oops", e);
-			}
-			Collections.sort(rules, new Comparator<String>() {
-
-				@Override
-				public int compare(String o1, String o2) {
-					int l1 = o1.trim().length();
-					int l2 = o2.trim().length();
-					if (l1 < l2)
-						return 1;
-					else if (l1 > l2)
-						return -1;
-					else
-						return 0;
-				}
-
-			});
+				log.error("Error reading list: " + path, e);
+			} 
+			
 		} finally {
 			try {
 				if (reader != null)
 					reader.close();
 			} catch (IOException e) {
-				log.error("oops", e);
+				log.error("Error closing list", e);
 			}
 		}
+		return list;
+	}
+	
+	private List<String> initalizeRuleList() throws ResourceInitializationException {
+		List<String> rules = listReader(NEGEX_TRIGGERS);
+		Collections.sort(rules, new Comparator<String>() {
+			@Override
+			public int compare(String o1, String o2) {
+				int l1 = o1.trim().length();
+				int l2 = o2.trim().length();
+				if (l1 < l2)
+					return 1;
+				else if (l1 > l2)
+					return -1;
+				else
+					return 0;
+			}
+
+		});
 		return rules;
 	}
+	
+	private List<String> initalizeExcludedKeyWords() throws ResourceInitializationException {
+		return listReader(NEGEX_EXCLUDED_KEYS);
+	}
 
-	private List<NegexRule> initializeRules() {
+	private void initializeRules() throws ResourceInitializationException {
 		List<String> listRules = this.initalizeRuleList();
-		List<NegexRule> listNegexRules = new ArrayList<NegexRule>(
-				listRules.size());
+		this.ruleCount = listRules.size();
 		Iterator<String> iRule = listRules.iterator();
 		while (iRule.hasNext()) {
 			String rule = iRule.next();
 			Pattern p = Pattern.compile("[\\t]+"); // Working.
 			String[] ruleTokens = p.split(rule.trim());
 			if (ruleTokens.length == 2) {
-				// Add the regular expression characters to tokens and asemble
-				// the
-				// rule again.
+				// Add the regular expression characters to tokens and assemble
+				// the rule again.
 				String[] ruleMembers = ruleTokens[0].trim().split(" ");
 				String rule2 = "";
+				boolean punctRule = false;
 				for (int i = 0; i <= ruleMembers.length - 1; i++) {
 					if (!ruleMembers[i].equals("")) {
 						if (ruleMembers.length == 1) {
-							rule2 = ruleMembers[i];
+							if (ruleMembers[0].length() == 1) {
+								String chrRule = ruleMembers[0];
+								if (chrRule.equals("\\")) {
+									chrRule += "\\";
+								}
+								rule2 = "[" + chrRule + "]";
+								punctRule = true;
+							} else {
+								rule2 = ruleMembers[i];
+							}	
 						} else {
 							rule2 = rule2 + ruleMembers[i].trim() + "\\s+";
 						}
 					}
 				}
-				// Remove the last s+
+				// Remove the last \\s+ (we will re-add it below)
 				if (rule2.endsWith("\\s+")) {
 					rule2 = rule2.substring(0, rule2.lastIndexOf("\\s+"));
 				}
+				
+				String rule3 = null;
 
-				String rule3 = "(?m)(?i)[[\\p{Punct}&&[^\\]\\[]]|\\s+]("
-						+ rule2 + ")[[\\p{Punct}&&[^_]]|\\s+]";
+				if (punctRule) {
+					rule3 = rule2 + "\\s+";
+				} else {
+					rule3 = "(?m)(?i)[[\\p{Punct}&&[^\\]\\[]]|\\s+]("
+							+ rule2 + ")[[\\p{Punct}&&[^_]]|\\s+]";
+				}
 
 				Pattern p2 = Pattern.compile(rule3.trim());
-				listNegexRules.add(new NegexRule(p2, rule2, ruleTokens[1]
-						.trim()));
+				NegexRule aRule = new NegexRule(p2, rule2, ruleTokens[1].trim());
+				populateWordCloud(ruleMembers, aRule);
+				
 			} else {
 				log.warn("could not parse rule:" + rule);
 			}
-			// Matcher m = p2.matcher(sentence);
-			//
-			// while (m.find() == true) {
-			// sentence = m.replaceAll(" " + ruleTokens[1].trim()
-			// + m.group().trim().replaceAll(" ", filler)
-			// + ruleTokens[1].trim() + " ");
-			// }
 		}
-		return listNegexRules;
+		if (log.isDebugEnabled()) {
+			// this helps to manually populate the excluded words list
+			for (Entry<String, ArrayList<NegexRule>> entry : wordCloud.entrySet()) {
+				log.debug("key: " + entry.getKey() + "  expression count: " + entry.getValue().size());
+			}
+		}
+		return;
+
+	}
 
+	/**
+	 * Creates a hashtable of arrays where the keys are words in regexps that can predict
+	 * a potential hit.  Each array is a bin of regex rules that are triggered if the key word is
+	 * found in the incoming sentence text.  Note that rules can live in multiple bins, so at the moment
+	 * of execution we also make sure that any single rule is only executed once for each NE.
+	 * eg.  there is an entry with key 'no' and array of all rules containing "no" in their Regex.
+	 * We will run these rules only on sentences that contain the string "no" 
+	 * 
+	 * It's not perfect, but it greatly reduces the number of match calls from the previous version
+	 * which blindly executed  all of them all the time.
+	 * 
+	 * @param ruleMembers
+	 * @param aRule
+	 */
+	private void populateWordCloud(String[] ruleMembers, NegexRule aRule) {
+		for(String token : ruleMembers) {
+			boolean skip = false;
+			for(String stp : this.excludedKeyWords) {
+				skip = (stp.equals(token));
+				if(skip == true) break;
+			}
+			if (!skip) {
+				ArrayList<NegexRule> entry = wordCloud.get(token);
+				if(entry == null) {
+					entry = new ArrayList<NegexRule>();
+					wordCloud.put(token, entry);
+					log.debug("Created a wordcloud bin for: " + token);
+				}
+				entry.add(aRule);
+			}
+		}
 	}
 
 	public static interface TargetAnnoFilter {
@@ -227,7 +312,6 @@ public class NegexAnnotator extends JCas
 	 * 
 	 */
 	public static class NamedEntityTargetAnnoFilter implements TargetAnnoFilter {
-
 		@Override
 		public boolean filter(Annotation anno) {
 			if (!(anno instanceof IdentifiedAnnotation))
@@ -236,14 +320,13 @@ public class NegexAnnotator extends JCas
 			return ia.getOntologyConceptArr() != null
 					&& ia.getOntologyConceptArr().size() > 0;
 		}
-
 	}
 
 	@Override
 	public void process(JCas aJCas) {
-		AnnotationIndex sentenceIdx = aJCas
+		AnnotationIndex<?> sentenceIdx = aJCas
 				.getAnnotationIndex(Sentence.typeIndexID);
-		AnnotationIndex neIdx = aJCas
+		AnnotationIndex<?> neIdx = aJCas
 				.getAnnotationIndex(IdentifiedAnnotation.typeIndexID);
 		negateAnnotations(aJCas, sentenceIdx, neIdx,
 				new NamedEntityTargetAnnoFilter());
@@ -260,17 +343,28 @@ public class NegexAnnotator extends JCas
 		}
 	}
 
-	private void negateAnnotations(JCas aJCas, AnnotationIndex sentenceIdx,
-			AnnotationIndex targetIdx, TargetAnnoFilter filter) {
-		FSIterator sentenceIter = sentenceIdx.iterator();
+	private void negateAnnotations(JCas aJCas, AnnotationIndex<?> sentenceIdx,
+			AnnotationIndex<?> targetIdx, TargetAnnoFilter filter) {
+		FSIterator<?> sentenceIter = sentenceIdx.iterator();
+		// initialize to beyond end of sentence
+		int lastStop = STOP_INIT;
 		while (sentenceIter.hasNext()) {
 			Sentence s = (Sentence) sentenceIter.next();
-			FSIterator neIter = targetIdx.subiterator(s);
+			String sText = "." + s.getCoveredText().toLowerCase() + ".";
+			FSIterator<?> neIter = targetIdx.subiterator(s);
 			while (neIter.hasNext()) {
 				Annotation ne = (Annotation) neIter.next();
-				if (filter == null || filter.filter(ne))
-					checkNegation(aJCas, s, ne);
-				// checkNegation2(aJCas, s, ne);
+				if (filter == null || filter.filter(ne)) {
+					int thisStop = checkNegation(aJCas, sText, s,  ne, lastStop);
+					// pick up from the last 
+					// CONJ tag and move forward.
+					// [preneg?] NE NE [postneg?] [CONJ] (next possible negations)
+					// reduces the number of matcher calls in complex sentences.
+					if (thisStop > lastStop || lastStop == STOP_INIT) {
+						lastStop = thisStop;
+					}
+					log.debug("LastStop:" + lastStop);
+				}
 			}
 		}
 	}
@@ -460,62 +554,57 @@ public class NegexAnnotator extends JCas
 	 * 
 	 * @param aJCas
 	 *            for adding annotations
+	 * @param sText
+	 * 				the covered text bracketed by . so it doesn't have to be re-done for each annotation in a sentence
 	 * @param s
 	 *            the sentence in which we will look
 	 * @param ne
 	 *            the named entity whose negation status will be checked.
-	 * @param checkPoss
-	 *            should possibility be checked?
-	 * @param negPoss
-	 *            should possiblities be negated?
+	 * @param lastStop
+	 * 			In the latter parts of compound sentences where the negation mode shifts, we can use this
+	 * 			to ignore what we've already processed, to reduce the number of unnecessary regex matches.
+	 * @return the end index of a [CONJ] found before the current annotation, or 0 if there is none or if a
+	 *			  [CONJ] follows the annotation.  This helps reset the start for subsequent regex scans to after the [CONJ]
 	 */
-	private void checkNegation(JCas aJCas, Sentence s, Annotation ne) {
+	private int checkNegation(JCas aJCas, String sText, Sentence s, Annotation ne, int lastStop) {
 		if (storeAsInterval && ne instanceof IdentifiedAnnotation) {
 			// default is affirmed, which is coded as confidence = 1
 			((IdentifiedAnnotation) ne).setConfidence(1);
 		}
-		// need to add . on either side due to the way the regexs are built
-		String sentence = "." + s.getCoveredText() + ".";
 		// allocate array of tokens
 		// this maps each character of the sentence to a token
-		NegexToken[] tokens = new NegexToken[sentence.length()];
+		NegexToken[] tokens = new NegexToken[sText.length()];
 		// char buffer for modify the sentence
 		// we want to 'black out' trigger words already found and the phrase we
 		// were looking for
-		CharBuffer buf = CharBuffer.wrap(sentence.toCharArray());
+		CharBuffer buf = CharBuffer.wrap(sText.toCharArray());
 		// calculate location of the ne relative to the sentence
 		int neRelStart = ne.getBegin() - s.getBegin() + 1;
 		int neRelEnd = ne.getEnd() - s.getBegin() + 1;
-		// black out the ne in the sentence buffer
+		
 		for (int i = neRelStart; i < neRelEnd; i++) {
 			// black out the named entity from the char buffer
 			buf.put(i, '_');
 		}
-		// look for negex rules in the sentence
-		for (NegexRule rule : this.listNegexRules) {
-			Matcher m = rule.getPattern().matcher(buf);
-			while (m.find() == true) {
-				// see if the range has not already been marked
-				boolean bUnoccupied = true;
-				for (int i = m.start(); i < m.end() && bUnoccupied; i++)
-					bUnoccupied = tokens[i] == null;
-				if (bUnoccupied) {
-					// mark the range in the sentence with this token
-					// black it out so other rules do not match
-					NegexToken t = new NegexToken(m.start(), m.end(), rule);
-					for (int i = m.start(); i < m.end() && bUnoccupied; i++) {
-						// black out this range from the char buffer
-						buf.put(i, '_');
-						// add the token to the array
-						tokens[i] = t;
-					}
-				}
-			}
-		}
-		// prenegation
+
+		// but if there was a stop clause from a previous phrase, blank up until then too.
+		// because we may have a new negation 
+		if (lastStop != 0 && lastStop < neRelStart) {
+			for (int i = 0; i < lastStop ; i++) {
+				buf.put(i, '_');
+			}
+		}
+		
+		if(log.isDebugEnabled()) {
+			// look for negex rules in the sentence
+			log.debug("Negex sentence: " + s.getCoveredText());
+			log.debug("Negex NE: ("+neRelStart+","+neRelEnd+")" + ne.getCoveredText());
+		}
+		
+		populateHits(tokens, buf, neRelStart, neRelEnd);
+		// pre-negation
 		// look for a PREN rule before the ne, without any intervening stop tags
-		NegexToken t = this.findTokenByTag("[PREN]", new String[] { "[CONJ]",
-				"[PSEU]", "[POST]", "[PREP]", "[POSP]" }, true, neRelStart,
+		NegexToken t = this.findTokenByTag("[PREN]", tagList, true, neRelStart,
 				neRelEnd, tokens);
 		if (t != null) {
 			// hit - negate the ne
@@ -523,283 +612,114 @@ public class NegexAnnotator extends JCas
 		} else {
 			// look for POST rule after the ne, without any intervening stop
 			// tags
-			t = this.findTokenByTag("[POST]", new String[] { "[CONJ]",
-					"[PSEU]", "[PREN]", "[PREP]", "[POSP]" }, false,
+			t = this.findTokenByTag("[POST]", tagList, false,
 					neRelStart, neRelEnd, tokens);
 			if (t != null) {
 				annotateNegation(aJCas, s, ne, t, true, false);
 			} else if (this.checkPossibilities || this.negatePossibilities) {
 				// check possibles
-				t = this.findTokenByTag("[PREP]", new String[] { "[CONJ]",
-						"[PSEU]", "[PREN]", "[POST]", "[POSP]" }, true,
+				t = this.findTokenByTag("[PREP]", tagList, true,
 						neRelStart, neRelEnd, tokens);
 				if (t != null) {
 					annotateNegation(aJCas, s, ne, t, false, true);
 				} else {
-					t = this.findTokenByTag("[POSP]", new String[] { "[CONJ]",
-							"[PSEU]", "[PREN]", "[POST]", "[PREP]" }, false,
+					t = this.findTokenByTag("[POSP]", tagList, false,
 							neRelStart, neRelEnd, tokens);
 					if (t != null)
 						annotateNegation(aJCas, s, ne, t, true, true);
 				}
 			}
 		}
-	}
 
-	private void checkNegation2(JCas aJCas, Sentence s,
-			IdentifiedAnnotation ne, boolean negPoss) {
-		// Sorter s = new Sorter();
-		String sToReturn = "";
-		String sScope = "";
-		// String sentencePortion = "";
-		// ArrayList sortedRules = null;
-
-		String filler = "_";
-		// boolean negationScope = true;
-
-		// Sort the rules by length in descending order.
-		// Rules need to be sorted so the longest rule is always tried to match
-		// first.
-		// Some of the rules overlap so without sorting first shorter rules
-		// (some of them POSSIBLE or PSEUDO)
-		// would match before longer legitimate negation rules.
-		//
-
-		// There is efficiency issue here. It is better if rules are sorted by
-		// the
-		// calling program once and used without sorting in GennegEx.
-		// sortedRules = this.rules;
-
-		// Process the sentence and tag each matched negation
-		// rule with correct negation rule tag.
-		//
-		// At the same time check for the phrase that we want to decide
-		// the negation status for and
-		// tag the phrase with [PHRASE] ... [PHRASE]
-		// In both the negation rules and in the phrase replace white space
-		// with "filler" string. (This could cause problems if the sentences
-		// we study has "filler" on their own.)
-
-		// Sentence needs one character in the beginning and end to match.
-		// We remove the extra characters after processing.
-		// vng String sentence = "." + sentenceString + ".";
-		String sentence = "." + s.getCoveredText() + ".";
-
-		// Tag the phrases we want to detect for negation.
-		// Should happen before rule detection.
-		// vng String phrase = phraseString;
-		String phrase = ne.getCoveredText();
-		Pattern pph = Pattern.compile(phrase.trim(), Pattern.CASE_INSENSITIVE);
-		Matcher mph = pph.matcher(sentence);
-		CharBuffer buf = CharBuffer.wrap(sentence.toCharArray());
-
-		while (mph.find() == true) {
-			sentence = mph.replaceAll(" [PHRASE]"
-					+ mph.group().trim().replaceAll(" ", filler) + "[PHRASE]");
-		}
-
-		for (NegexRule rule : this.listNegexRules) {
-			Matcher m = rule.getPattern().matcher(sentence);
-			while (m.find() == true) {
-				sentence = m.replaceAll(" " + rule.getTag()
-						+ m.group().trim().replaceAll(" ", filler)
-						+ rule.getTag() + " ");
-			}
-		}
-
-		// Exchange the [PHRASE] ... [PHRASE] tags for [NEGATED] ... [NEGATED]
-		// based of PREN, POST rules and if flag is set to true
-		// then based on PREP and POSP, as well.
-
-		// Because PRENEGATION [PREN} is checked first it takes precedent over
-		// POSTNEGATION [POST].
-		// Similarly POSTNEGATION [POST] takes precedent over POSSIBLE
-		// PRENEGATION [PREP]
-		// and [PREP] takes precedent over POSSIBLE POSTNEGATION [POSP].
-
-		Pattern pSpace = Pattern.compile("[\\s+]");
-		String[] sentenceTokens = pSpace.split(sentence);
-		StringBuilder sb = new StringBuilder();
-
-		// Check for [PREN]
-		for (int i = 0; i < sentenceTokens.length; i++) {
-			sb.append(" " + sentenceTokens[i].trim());
-			if (sentenceTokens[i].trim().startsWith("[PREN]")) {
-
-				for (int j = i + 1; j < sentenceTokens.length; j++) {
-					if (sentenceTokens[j].trim().startsWith("[CONJ]")
-							|| sentenceTokens[j].trim().startsWith("[PSEU]")
-							|| sentenceTokens[j].trim().startsWith("[POST]")
-							|| sentenceTokens[j].trim().startsWith("[PREP]")
-							|| sentenceTokens[j].trim().startsWith("[POSP]")) {
-						break;
-					}
+		// now see if we found the first stop clause (CONJ) only after our current entity
+		// if not actions need to proceed from sentence start
 
-					if (sentenceTokens[j].trim().startsWith("[PHRASE]")) {
-						sentenceTokens[j] = sentenceTokens[j].trim()
-								.replaceAll("\\[PHRASE\\]", "[NEGATED]");
-					}
-				}
-			}
-		}
+		if (findTokenByTag("[CONJ]", tagList, false, neRelEnd,
+				neRelEnd, tokens) != null) {
+			return 0;
+		}
+		
+		// now see if we found a stop clause (CONJ) before our current entity
+		// if so, set up a blanking index we can use in the next iteration to ignore
+		// any possible negations that came before the semantic break.
+		//    "no headache but reports rash"  
+		//     "rash but not headache"
+		// should bother work
+
+		// now look for one before current annotation
+		t = this.findTokenByTag("[CONJ]", tagList, true, neRelStart,
+				neRelStart, tokens);
+		if (t != null) 
+			return t.getEnd();
 
-		sentence = sb.toString();
-		pSpace = Pattern.compile("[\\s+]");
-		sentenceTokens = pSpace.split(sentence);
-		StringBuilder sb2 = new StringBuilder();
-
-		// Check for [POST]
-		for (int i = sentenceTokens.length - 1; i > 0; i--) {
-			sb2.insert(0, sentenceTokens[i] + " ");
-			if (sentenceTokens[i].trim().startsWith("[POST]")) {
-				for (int j = i - 1; j > 0; j--) {
-					if (sentenceTokens[j].trim().startsWith("[CONJ]")
-							|| sentenceTokens[j].trim().startsWith("[PSEU]")
-							|| sentenceTokens[j].trim().startsWith("[PREN]")
-							|| sentenceTokens[j].trim().startsWith("[PREP]")
-							|| sentenceTokens[j].trim().startsWith("[POSP]")) {
-						break;
-					}
+		return 0;
+	}
 
-					if (sentenceTokens[j].trim().startsWith("[PHRASE]")) {
-						sentenceTokens[j] = sentenceTokens[j].trim()
-								.replaceAll("\\[PHRASE\\]", "[NEGATED]");
+	/**
+	 * Peter A. 12-2021
+	 * Try to execute as few regex operations as possible.
+	 * We do this by only testing rules that have at least one key word in common
+	 * with the text in the sentence buf, which eliminates the rules we know would fail.
+	 * This pays off because matcher is much less efficient than indexOf.
+	 * @param tokens
+	 * @param buf
+	 * @param neEnd 
+	 * @param neStart 
+	 */
+	private void populateHits(NegexToken[] tokens, CharBuffer buf, int neStart, int neEnd) {
+		String bText = buf.toString();
+		int count = 0;
+		HashMap<Integer,Integer> deDupe = new HashMap<Integer,Integer>(this.ruleCount);
+		for (Entry<String, ArrayList<NegexRule>> ent : this.wordCloud.entrySet()) {
+			if (bText.indexOf(ent.getKey()) >= 0) {
+				for (NegexRule rule : ent.getValue()) {
+					Integer iH = new Integer(rule.hashCode());
+					if (deDupe.containsKey(iH)) {
+						// do not execute the same rule twice
+						// this is because in the wordCloud, same rules may occur in different bins
+						continue;
 					}
-				}
-			}
-		}
-		sentence = sb2.toString();
-
-		// If POSSIBLE negation is detected as negation.
-		// negatePossible being set to "true" then check for [PREP] and [POSP].
-		if (negPoss == true) {
-			pSpace = Pattern.compile("[\\s+]");
-			sentenceTokens = pSpace.split(sentence);
-
-			StringBuilder sb3 = new StringBuilder();
-
-			// Check for [PREP]
-			for (int i = 0; i < sentenceTokens.length; i++) {
-				sb3.append(" " + sentenceTokens[i].trim());
-				if (sentenceTokens[i].trim().startsWith("[PREP]")) {
-
-					for (int j = i + 1; j < sentenceTokens.length; j++) {
-						if (sentenceTokens[j].trim().startsWith("[CONJ]")
-								|| sentenceTokens[j].trim()
-										.startsWith("[PSEU]")
-								|| sentenceTokens[j].trim()
-										.startsWith("[POST]")
-								|| sentenceTokens[j].trim()
-										.startsWith("[PREN]")
-								|| sentenceTokens[j].trim()
-										.startsWith("[POSP]")) {
-							break;
-						}
-
-						if (sentenceTokens[j].trim().startsWith("[PHRASE]")) {
-							sentenceTokens[j] = sentenceTokens[j].trim()
-									.replaceAll("\\[PHRASE\\]", "[POSSIBLE]");
+					deDupe.put(iH, iH);
+					Matcher m = rule.getPattern().matcher(buf);
+					count++;
+					while (m.find() == true) {
+						if (log.isDebugEnabled()) {
+							log.debug("Regex buf before: " + buf.toString());
+							log.debug("rule: \'" + rule.getRule() + "\' match at :" + m.start() + "," + m.end() );
 						}
-					}
-				}
-			}
-			sentence = sb3.toString();
-			pSpace = Pattern.compile("[\\s+]");
-			sentenceTokens = pSpace.split(sentence);
-			StringBuilder sb4 = new StringBuilder();
-
-			// Check for [POSP]
-			for (int i = sentenceTokens.length - 1; i > 0; i--) {
-				sb4.insert(0, sentenceTokens[i] + " ");
-				if (sentenceTokens[i].trim().startsWith("[POSP]")) {
-					for (int j = i - 1; j > 0; j--) {
-						if (sentenceTokens[j].trim().startsWith("[CONJ]")
-								|| sentenceTokens[j].trim()
-										.startsWith("[PSEU]")
-								|| sentenceTokens[j].trim()
-										.startsWith("[PREN]")
-								|| sentenceTokens[j].trim()
-										.startsWith("[PREP]")
-								|| sentenceTokens[j].trim()
-										.startsWith("[POST]")) {
+						// in poorly punctuated notes ignore matches which occur far from the NE we
+						// are judging.
+						if ((m.start() < Math.max(0, neStart - MATCHER_WINDOW)) ||
+								m.end() > (neEnd + MATCHER_WINDOW)) {
 							break;
 						}
-
-						if (sentenceTokens[j].trim().startsWith("[PHRASE]")) {
-							sentenceTokens[j] = sentenceTokens[j].trim()
-									.replaceAll("\\[PHRASE\\]", "[POSSIBLE]");
+						deDupe.clear();
+						boolean bUnoccupied = true;
+						// When two adjacent rules share the same punctuation or space,
+						// the code must allow for an overlap of one character, e.g. in the phrase "A but no B"
+						// " but " is CONJ while " no " is PREN.  The space between them shows up in both
+						// regex matches!
+						for (int i = m.start(); i < m.end() && bUnoccupied; i++)
+							bUnoccupied = (tokens[i] == null || (tokens[i].getEnd() - 1) == i);
+						
+						if (bUnoccupied) {
+							// mark the range in the sentence with this token
+							NegexToken t = new NegexToken(m.start(), m.end(), rule);
+							for (int i = m.start(); i < m.end() && bUnoccupied; i++) {
+								// blank out this range from the char buffer
+								buf.put(i, '_');
+								// add the token to the array
+								tokens[i] = t;
+							}
+							// sync text with new buf;
+							log.debug("Regex buf after: " + buf.toString());
+							bText = buf.toString();
 						}
 					}
 				}
 			}
-			sentence = sb4.toString();
 		}
-
-		// Remove the filler character we used.
-		sentence = sentence.replaceAll(filler, " ");
-
-		// Remove the extra periods at the beginning
-		// and end of the sentence.
-		sentence = sentence.substring(0, sentence.trim().lastIndexOf('.'));
-		sentence = sentence.replaceFirst(".", "");
-
-		// Get the scope of the negation for PREN and PREP
-		if (sentence.contains("[PREN]") || sentence.contains("[PREP]")) {
-			int startOffset = sentence.indexOf("[PREN]");
-			if (startOffset == -1) {
-				startOffset = sentence.indexOf("[PREP]");
-			}
-
-			int endOffset = sentence.indexOf("[CONJ]");
-			if (endOffset == -1) {
-				endOffset = sentence.indexOf("[PSEU]");
-			}
-			if (endOffset == -1) {
-				endOffset = sentence.indexOf("[POST]");
-			}
-			if (endOffset == -1) {
-				endOffset = sentence.indexOf("[POSP]");
-			}
-			if (endOffset == -1 || endOffset < startOffset) {
-				endOffset = sentence.length() - 1;
-			}
-			sScope = sentence.substring(startOffset, endOffset + 1);
-		}
-
-		// Get the scope of the negation for POST and POSP
-		if (sentence.contains("[POST]") || sentence.contains("[POSP]")) {
-			int endOffset = sentence.lastIndexOf("[POST]");
-			if (endOffset == -1) {
-				endOffset = sentence.lastIndexOf("[POSP]");
-			}
-
-			int startOffset = sentence.lastIndexOf("[CONJ]");
-			if (startOffset == -1) {
-				startOffset = sentence.lastIndexOf("[PSEU]");
-			}
-			if (startOffset == -1) {
-				startOffset = sentence.lastIndexOf("[PREN]");
-			}
-			if (startOffset == -1) {
-				startOffset = sentence.lastIndexOf("[PREP]");
-			}
-			if (startOffset == -1) {
-				startOffset = 0;
-			}
-			sScope = sentence.substring(startOffset, endOffset);
-		}
-
-		// Classify to: negated/possible/affirmed
-		if (sentence.contains("[NEGATED]")) {
-			sentence = sentence + "\t" + "negated" + "\t" + sScope;
-		} else if (sentence.contains("[POSSIBLE]")) {
-			sentence = sentence + "\t" + "possible" + "\t" + sScope;
-		} else {
-			sentence = sentence + "\t" + "affirmed" + "\t" + sScope;
-		}
-
-		sToReturn = sentence;
-		System.out.println(sToReturn);
+		log.debug("Rules tried: " + count);
 	}
 
 	/**
@@ -843,7 +763,10 @@ public class NegexAnnotator extends JCas
 			}
 		}
 		ContextAnnotation nec = new ContextAnnotation(aJCas);
-		nec.setBegin(s.getBegin() + t.getStart() - 1);
+		// There is a bug way back in UIMA source which occasionally
+		// returns a begin of -1 when certain POS types begin a sentence.
+		int begin = Math.max(0, (s.getBegin() + t.getStart() - 1));
+		nec.setBegin(begin);
 		nec.setEnd(s.getBegin() + t.getEnd() - 1);
 		nec.setScope(t.getTag());
 		nec.setFocusText(anno.getCoveredText());