You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@uima.apache.org by sc...@apache.org on 2009/04/17 16:08:25 UTC
svn commit: r766019 [2/2] - in /incubator/uima/sandbox/trunk/ConceptMapper:
docbook/ConceptMapperAnnotatorUserGuide/
src/main/java/org/apache/uima/conceptMapper/support/dictionaryResource/
src/main/java/org/apache/uima/conceptMapper/support/dictionaryR...
Modified: incubator/uima/sandbox/trunk/ConceptMapper/src/main/java/org/apache/uima/conceptMapper/support/dictionaryResource/DictionaryResource_impl.java
URL: http://svn.apache.org/viewvc/incubator/uima/sandbox/trunk/ConceptMapper/src/main/java/org/apache/uima/conceptMapper/support/dictionaryResource/DictionaryResource_impl.java?rev=766019&r1=766018&r2=766019&view=diff
==============================================================================
--- incubator/uima/sandbox/trunk/ConceptMapper/src/main/java/org/apache/uima/conceptMapper/support/dictionaryResource/DictionaryResource_impl.java (original)
+++ incubator/uima/sandbox/trunk/ConceptMapper/src/main/java/org/apache/uima/conceptMapper/support/dictionaryResource/DictionaryResource_impl.java Fri Apr 17 14:08:15 2009
@@ -1,996 +1,996 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.uima.conceptMapper.support.dictionaryResource;
-
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.ObjectOutputStream;
-import java.io.Serializable;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.Comparator;
-import java.util.Enumeration;
-import java.util.Hashtable;
-import java.util.Iterator;
-import java.util.Properties;
-import java.util.TreeMap;
-import java.util.Vector;
-import java.util.regex.Pattern;
-
-import org.apache.uima.analysis_engine.annotator.AnnotatorContext;
-import org.apache.uima.conceptMapper.Logger;
-import org.apache.uima.conceptMapper.support.dictionaryResource.annotatorAdaptor.AnnotatorAdaptor;
-import org.apache.uima.conceptMapper.support.tokens.TokenFilter;
-import org.apache.uima.conceptMapper.support.tokens.TokenNormalizer;
-import org.apache.uima.resource.DataResource;
-import org.apache.uima.resource.ResourceInitializationException;
-import org.apache.uima.resource.SharedResourceObject;
-import org.xml.sax.Attributes;
-import org.xml.sax.InputSource;
-import org.xml.sax.SAXException;
-import org.xml.sax.SAXParseException;
-import org.xml.sax.XMLReader;
-import org.xml.sax.helpers.DefaultHandler;
-import org.xml.sax.helpers.XMLReaderFactory;
-
-/*
- * @version $Revision: 1.5 $
- *
- * @author Michael Tanenblatt <mt...@us.ibm.com> @author (c) Copyright 2005 IBM @author All Rights
- * Reserved
- */
-/**
- * Implementation of a UIMA DictionaryResource
- */
-
-public class DictionaryResource_impl implements DictionaryResource, SharedResourceObject {
- /** Dictionary file loader. Uses an XML parser. */
- protected DictLoader dictLoader;
-
- /**
- * Hashtable of first words. Contains a DictEntries object keyed on word string for the first word
- * of every entry in the specified dictionary.
- */
- protected Hashtable<String, DictEntriesByLength> dictImpl;
-
- /** Initial size of <code>dict</code> */
- protected static final int NumOfInitialDictEntries = 500000;
-
- boolean loaded;
-
- /** Patterns to for matcher to replace SGML &lt; entities */
- private static final Pattern ltPattern = Pattern.compile("<");
-
- /** Patterns to for matcher to replace SGML &gt; entities */
- private static final Pattern gtPattern = Pattern.compile(">");
-
- /** Patterns to for matcher to replace SGML &apos; entities */
- private static final Pattern aposPattern = Pattern.compile("'");
-
- /** Patterns to for matcher to replace SGML &quot; entities */
- private static final Pattern quotPattern = Pattern.compile(""");
-
- /** Patterns to for matcher to replace SGML &amp; entities */
- private static final Pattern ampPattern = Pattern.compile("&");
-
- /** Configuration parameter key/label for the order independent lookup indicator */
- public static final String PARAM_ORDERINDEPENDENTLOOKUP = "OrderIndependentLookup";
-
- private boolean sortElements = false;
-
- /** Configuration parameter key/label to indicate whether dictionary should be printed upon load */
- public static final String PARAM_DUMPDICT = "PrintDictionary";
-
- private boolean dumpDict = false;
-
- /** Configuration parameter key/label for the case matching string */
- public static final String PARAM_CASE_MATCH = "caseMatch";
-
- /** Configuration parameter key/label for the stemmer class spec. If left out, no stemmer is used */
- public static final String PARAM_STEMMER_CLASS = "Stemmer";
-
- /**
- * Configuration parameter key/label for the stemmer dictionary, passed into the stemmer's
- * initialization method
- */
- public static final String PARAM_STEMMER_DICT = "StemmerDictionary";
-
- private static final String PARAM_LANGID = "LanguageID";
-
- private static final String DEFAULT_LANGID = "en";
-
- public int entryNum = 0;
-
- /**
- * @param NumOfInitialDictEntries
- *
- */
- public DictionaryResource_impl() {
- super();
- dictImpl = new Hashtable<String, DictEntriesByLength>();
- loaded = false;
- }
-
- /**
- * @param NumOfInitialDictEntries
- *
- */
- public DictionaryResource_impl(int initialDictEntries) {
- super();
- dictImpl = new Hashtable<String, DictEntriesByLength>(initialDictEntries);
- loaded = false;
- }
-
- /**
- * @return Returns the dictLoader.
- */
- public DictLoader getDictLoader() {
- return dictLoader;
- }
-
- /*
- * (non-Javadoc)
- *
- * @see org.apache.uima.conceptMapper.conceptMapper.support.DictionaryResource#getEntry(java.lang.String)
- */
- public DictEntriesByLength getEntries(String key) {
- return dictImpl.get(key);
- }
-
- /**
- * Create a new dictionary entry.
- *
- * @param key
- * the key to index on
- * @param phrase
- * the phrase to be entered in the dictionary
- * @param length
- * the number of words in the phrase (>=1)
- * @param props
- * the properties object for the dictionary entry
- */
- public void putEntry(String key, String text, ArrayList<String> elements, String unsorted,
- int length, Properties props) {
- DictEntriesByLength entry = getEntries(key);
-
- if (entry == null) {
- entry = new DictEntriesByLength_impl();
- // System.err.println ("DICT adding: " + key + ", " + text);
- dictImpl.put(key, entry);
- }
- // Iterator<String> elemIter = elements.iterator ();
- // while (elemIter.hasNext ())
- // {
- // System.err.println("ENTRY ELEMENT: " + elemIter.next ());
- // }
- entry.putEntry(length, text, elements, unsorted, props);
- // System.err.println("ENTRY: '" + text + "', PROPS: ");
- // Enumeration propKeys = props.keys();
- // while (propKeys.hasMoreElements())
- // {
- // String propKey = (String) propKeys.nextElement();
- // System.err.println ("\t" + propKey + ": " + props.getProperty(propKey));
- // }
- }
-
- /*
- * (non-Javadoc)
- *
- * @see org.apache.uima.resource.SharedResourceObject#load(org.apache.uima.resource.DataResource)
- */
- public void load(DataResource data) throws ResourceInitializationException {
- // InputStream dictStream = null;
- try {
- Logger logger = new Logger("Dictionary Resource Loader", data.getLogger());
- dictLoader = new DictLoader(logger, this, data);
-
- /*
- * At least for now, while info from the AnnotatorContext is not available here, must delay
- * loading until able to access this info
- *
- * //open input stream to data dictStream = data.getInputStream ();
- *
- * dictLoader = new DictLoader (data.getLogger (), this); dictLoader.setDictionary
- * (dictStream, NumOfInitialDictEntries);
- */
- } catch (Exception e) {
- throw new ResourceInitializationException(e);
- }
- /*
- * finally { if (dictStream != null) { try { dictStream.close (); } catch (IOException e) { } } }
- */
- }
-
- public void loadDictionaryContents(AnnotatorContext aContext, Logger logger,
- String tokenAnnotationName, String tokenTypeFeatureName, String tokenClassFeatureName,
- String tokenizerDescriptor) throws ResourceInitializationException {
-
- InputStream dictStream = null;
- try {
- dictLoader.setLogger(logger);
- Boolean sortElementsParam = new Boolean(false);
-
- sortElementsParam = (Boolean) aContext.getConfigParameterValue(PARAM_ORDERINDEPENDENTLOOKUP);
- if (sortElementsParam == null) {
- sortElements = false;
- } else {
- sortElements = sortElementsParam.booleanValue();
- }
-
- Boolean dumpDictParam = (Boolean) aContext.getConfigParameterValue(PARAM_DUMPDICT);
- if (dumpDictParam == null) {
- dumpDict = false;
- } else {
- dumpDict = dumpDictParam.booleanValue();
- }
-
- TokenNormalizer tokenNormalizer = new TokenNormalizer(aContext, logger);
-
- // open input stream to data
- dictStream = dictLoader.getInputStream();
-
- TokenFilter tokenFilter = new TokenFilter(tokenAnnotationName, tokenTypeFeatureName,
- tokenClassFeatureName, logger);
- tokenFilter.initConfig(aContext);
-
- String langID = (String) aContext.getConfigParameterValue(PARAM_LANGID);
- if ((langID == null) || (langID == ""))
- {
- langID = DEFAULT_LANGID;
- }
- // System.out.print ("Loading Dictionary: '" + dictLoader.dataResource.getUri().toString() +
- // "'...");
- // System.out.print ("Loading Dictionary...");
- logger.logInfo("Loading Dictionary...");
- dictLoader.setDictionary(dictStream, NumOfInitialDictEntries, tokenAnnotationName,
- tokenTypeFeatureName, tokenClassFeatureName, tokenizerDescriptor, tokenFilter,
- tokenNormalizer, langID);
- logger.logInfo("...done");
- // System.out.println ("done");
- // System.err.println("NEW DICT:\n" + toString());
- setLoaded(true);
- if (dumpDict)
- {
- System.err.println("BEGIN DUMPING DICTIONARY:");
- System.err.println(toString());
- System.err.println("DONE DUMPING DICTIONARY! ");
- }
-
- } catch (Exception e) {
- throw new ResourceInitializationException(e);
- } finally {
- if (dictStream != null) {
- try {
- dictStream.close();
- } catch (IOException e) {
- }
- }
- }
-
- }
-
- /**
- * @return Returns the loaded.
- */
- public boolean isLoaded() {
- return loaded;
- }
-
- /**
- * @param loaded
- * The loaded to set.
- */
- public void setLoaded(boolean loaded) {
- this.loaded = loaded;
- }
-
- private static class DictEntriesByLength_impl implements DictEntriesByLength {
- private static final long serialVersionUID = -8150386021246495622L;
-
- private class ReverseOrderIntegerComparator implements Comparator<Integer>, Serializable {
-
- /**
- *
- */
- private static final long serialVersionUID = -805437355806223406L;
-
- /*
- * (non-Javadoc)
- *
- * @see java.util.Comparator#compare(java.lang.Object, java.lang.Object)
- */
- public int compare(Integer left, Integer right) {
- // reverse the order of parameters, to reverse the sorting order
- return (right.compareTo(left));
- }
-
- }
-
- TreeMap<Integer, DictEntries> entries;
-
- public DictEntriesByLength_impl() {
- super();
- entries = new TreeMap<Integer, DictEntries>(new ReverseOrderIntegerComparator());
- }
-
- /*
- * (non-Javadoc)
- *
- * @see org.apache.uima.conceptMapper.support.dictionaryResource.DictionaryResource.DictEntriesByLength#getEntries(int)
- */
- public DictEntries getEntries(int length) {
- return entries.get(new Integer(length));
- }
-
- /*
- * (non-Javadoc)
- *
- * @see org.apache.uima.conceptMapper.support.dictionaryResource.DictionaryResource.DictEntriesByLength#putEntry(int,
- * java.lang.String, java.util.Properties)
- */
- public void putEntry(int length, String text, ArrayList<String> elements, String unsorted,
- Properties props) {
- DictEntries entry = getEntries(length);
- if (entry == null) {
- entry = new DictEntriesImpl();
- entries.put(new Integer(length), entry);
- }
- entry.putEntry(text, elements, unsorted, props);
- }
-
- /*
- * (non-Javadoc)
- *
- * @see org.apache.uima.conceptMapper.support.dictionaryResource.DictionaryResource.DictEntriesByLength#getLongest()
- */
- public Integer getLongest() {
- return entries.firstKey();
- }
-
- public Integer getShortest() {
- return entries.lastKey();
- }
-
- public String toString() {
- StringBuffer result = new StringBuffer();
-
- int i = getLongest().intValue();
- int last = getShortest().intValue();
-
- while (i >= last) {
- DictEntriesImpl entries = (DictEntriesImpl) getEntries(i);
- if (entries != null) {
- result.append("<DictEntriesByLength length='" + i + "'>\n");
- result.append(entries.toString());
- result.append("</DictEntriesByLength>\n");
- }
- i--;
- }
- return result.toString();
- }
- }
-
- /**
- * Private class for storing first words in the dict hashtable.
- */
- public static class DictEntriesImpl extends ArrayList<DictEntry> implements DictEntries {
-
- /**
- *
- */
- private static final long serialVersionUID = 1L;
-
- /**
- * Create a new dictionary entry.
- */
- public DictEntriesImpl() {
- }
-
- /**
- * Add a new phrase to an existing dictionary entry.
- *
- * @param text
- * the text to be entered in the dictionary
- * @param props
- * the properties object for the phrase
- */
- public void putEntry(String text, Collection<String> elements, String unsorted, Properties props) {
- add(new DictEntryImpl(text, elements, unsorted, props));
- }
-
- /*
- * (non-Javadoc)
- *
- * @see org.apache.uima.conceptMapper.support.dictionaryResource.DictionaryResource.DictEntries#getEntry(java.lang.String)
- */
- public ArrayList<DictEntry> getEntries() {
- return this;
- }
-
- public String toString() {
- StringBuffer result = new StringBuffer("<DictEntries>");
- for (int i = 0; i < size(); i++) {
- result.append(((DictEntryImpl) get(i)).toString());
- }
- result.append("</DictEntries>");
- return result.toString();
- }
-
- }
-
- public static class DictEntryImpl implements DictEntry {
- private static final long serialVersionUID = -7723934333674544157L;
-
- String text;
-
- Collection<String> elements;
-
- String unsorted;
-
- Properties properties;
-
- /**
- * @param text
- * @param elements
- * @param unsorted
- * @param properties
- */
- public DictEntryImpl(String text, Collection<String> elements, String unsorted,
- Properties properties) {
- super();
- this.properties = properties;
- this.unsorted = unsorted;
- this.text = text;
- this.elements = elements;
- }
-
- /**
- * @return Returns the text.
- */
- public String getText() {
- return text;
- }
-
- /**
- * @param text
- * The text to set.
- */
- public void setText(String text) {
- this.text = text;
- }
-
- /**
- * @param properties
- * The properties to set.
- */
- public void setProperties(Properties properties) {
- this.properties = properties;
- }
-
- /*
- * (non-Javadoc)
- *
- * @see org.apache.uima.conceptMapper.support.dictionaryResource.DictionaryResource.DictEntry#getProperties()
- */
- public Properties getProperties() {
- return properties;
- }
-
- public String toString() {
- StringBuffer result = new StringBuffer("<DictEntry Text ='" + getText() + "'>");
- Enumeration<?> e = getProperties().propertyNames();
-
- while (e.hasMoreElements()) {
- String propName = (String) e.nextElement();
- result.append("<property name='" + propName + "'>");
- String item = getProperties().getProperty(propName);
- result.append(item);
- result.append("</property>\n");
- }
- result.append("</DictEntry>\n");
- return result.toString();
-
- }
-
- /*
- * (non-Javadoc)
- *
- * @see org.apache.uima.conceptMapper.support.dictionaryResource.DictionaryResource.DictEntry#getUnsorted()
- */
- public String getUnsorted() {
- return unsorted;
- }
-
- /*
- * (non-Javadoc)
- *
- * @see org.apache.uima.conceptMapper.support.dictionaryResource.DictionaryResource.DictEntry#setUnsorted(java.lang.String)
- */
- public void setUnsorted(String unsorted) {
- this.unsorted = unsorted;
- }
-
- public Collection<String> getElements() {
- return elements;
- }
-
- public void setElements(Collection<String> elements) {
- this.elements = elements;
- }
-
- }
-
- /**
- * Convert character entities in a string to the corresponding character. The set of entities
- * handled includes:
- * <ul>
- * <li>&amp;
- * <li>&lt;
- * <li>&gt;
- * <li>&apos;
- * <li>&quot;
- * </ul>
- *
- * @param input
- * the string to process.
- *
- * @return the string with converted entities
- */
- protected String convertEntities(String input) {
- String result = ltPattern.matcher(input).replaceAll("<");
- result = gtPattern.matcher(result).replaceAll(">");
- result = aposPattern.matcher(result).replaceAll("'");
- result = quotPattern.matcher(result).replaceAll("\"");
- result = ampPattern.matcher(result).replaceAll("&");
-
- return (result);
- }
-
- /**
- * Private class to load the dictionary file. Extends the org.xml.sax.helpers.DefaultHandler for
- * XML parsing.
- */
- private class DictLoader extends DefaultHandler {
- /** Default parser name. */
- protected static final String DEFAULT_PARSER_NAME = "org.apache.xerces.parsers.SAXParser";
-
- /** Default name of element that contains dictionary records. */
- protected static final String DEFAULT_TOKEN_ELEM = "token";
-
- /** Default name of element that contains variant form */
- protected static final String DEFAULT_VARIANT_ELEM = "variant";
-
- /**
- * Default name of attribute in the token element that contains the key for the entry (i.e., the
- * surface or variant form that will be found in the document).
- */
- protected static final String DEFAULT_KEY_ATTR = "base";
-
- /** Default name of element that contains part of speech */
- // protected static final String DEFAULT_POS_ATTR = "POS";
- /** The XML parser that parses the dictionary file. */
- private XMLReader parser = null;
-
- /** The name of the element that contains the dictionary records. */
- private String token_elem = DEFAULT_TOKEN_ELEM;
-
- /**
- * The name of the element that contains a variant form for the current entry
- */
- private String variant_elem = DEFAULT_VARIANT_ELEM;
-
- /** The name of the attribute that contains the key. */
- private String key_attribute = DEFAULT_KEY_ATTR;
-
- /** The name of the attribute that contains the part of speech. */
- // private String pos_attribute = DEFAULT_POS_ATTR;
- /** Count of number of dictionary entries loaded. */
- private int term_cnt = 0;
-
- /** Reference to UIMA logger. */
- private Logger logger;
-
- /** The hashtable built while parsing the dictionary file. */
- private DictionaryResource dict;
-
- /** Properties for current canonical form */
- private Properties props;
-
- private AnnotatorAdaptor adaptor;
-
- private String tokenAnnotationName;
-
- private String tokenizerDescriptor;
-
- private TokenFilter tokenFilter;
-
- private TokenNormalizer tokenNormalizer;
-
- /**
- * needed to access input stream, since cannot load external dict resource until TAE config
- * params are available to set up tokenizer correctly
- */
- DataResource dataResource;
-
- private Vector<DictionaryToken> result;
-
- /**
- * Create a dictionary loader.
- *
- * @param The
- * UIMA logger.
- *
- * @throws Exception
- * if XML parser cannot be created.
- */
- public DictLoader(Logger log, DictionaryResource dict, DataResource data) throws Exception {
- this.setLogger(log);
- this.dict = dict;
- this.dataResource = data;
-
- // create parser
- try {
- parser = XMLReaderFactory.createXMLReader(DEFAULT_PARSER_NAME);
- } catch (Exception e) {
- log.logError("Unable to instantiate dictionary parser (" + DEFAULT_PARSER_NAME + ")");
- throw (e);
- }
- parser.setContentHandler(this);
- parser.setErrorHandler(this);
-
- }
-
- protected String getTokenAnnotationName() {
- return tokenAnnotationName;
- }
-
- protected void setTokenAnnotationName(String tokenAnnotationName) {
- this.tokenAnnotationName = tokenAnnotationName;
- }
-
- protected String getTokenizerDescriptor() {
- return tokenizerDescriptor;
- }
-
- protected void setTokenizerDescriptor(String tokenizerDescriptor) {
- this.tokenizerDescriptor = tokenizerDescriptor;
- }
-
- protected void setTokenFilter(TokenFilter tokenFilter) {
- this.tokenFilter = tokenFilter;
- }
-
- protected TokenFilter getTokenFilter() {
- return tokenFilter;
- }
-
- protected void setTokenNormalizer(TokenNormalizer tokenNormalizer) {
- this.tokenNormalizer = tokenNormalizer;
- }
-
- protected TokenNormalizer getTokenNormalizer() {
- return tokenNormalizer;
- }
-
- /**
- * @return Returns the log.
- */
- public Logger getLogger() {
- return logger;
- }
-
- /**
- * @param log
- * The log to set.
- */
- public void setLogger(Logger logger) {
- this.logger = logger;
- }
-
- /**
- * Start element. This method does most of the work of building the hashtable.
- *
- */
- public void startElement(String uri, String local, String raw, Attributes attrs)
- throws SAXException {
-
- String key = null;
- DictionaryToken token = null;
- int length = 0;
-
- if (raw.equals(token_elem)) { // starting new token entry
- if (attrs != null) {
-
- props = new Properties();
- int attrCount = attrs.getLength();
- for (int i = 0; i < attrCount; i++) {
- props.setProperty(attrs.getQName(i), convertEntities(attrs.getValue(i)));
- }
- }
- } else if (raw.equals(variant_elem)) { // variant for current token
- if (attrs != null) {
- int attrCount = attrs.getLength();
- ArrayList<String> tokens = new ArrayList<String>();
-
- // if this variant contains its own POS info, save token level POS info and set props to
- // contain variant's
- Properties variantProperties = new Properties(props);
- // logger.logInfo("" + entryNum++);
- //
- // System.err.println("" + entryNum++);
- for (int i = 0; i < attrCount; i++) {
- if (attrs.getQName(i).equals(key_attribute)) { // key attribute?
-
- adaptor.runCPM(convertEntities(attrs.getValue(i)));
-
- //if (dumpDict)
- //{
- // System.err.println ("Adaptor done, result size: " + result.size() + "\nresult:\n");
- // for (DictionaryToken resultItem : result)
- // {
- // System.err.println (" " + resultItem.getText());
- // }
- //}
-
- Iterator<DictionaryToken> tokenIter = result.iterator();
- token = null;
-
- while (tokenIter.hasNext()) {
- token = (DictionaryToken) tokenIter.next();
- //if (dumpDict)
- //{
- // System.err.println ("TOKEN CLASS: '" + token.getTokenClass() + "', TOKEN TYPE: '" + token.getType() + "'");
- //}
- if (tokenFilter.isOK_Token(token, tokenNormalizer)) {
- break;
- }
- }
-
- if (token == null) {
- return;
- }
-
- key = new String(tokenNormalizer.normalize(token.getText()));
- //if (dumpDict)
- //{
- // System.err.println ("variant token key:" + key);
- //}
-
- tokens.add(key);
- length = 1;
- while (tokenIter.hasNext()) {
- token = (DictionaryToken) tokenIter.next();
- String tokenText = tokenNormalizer.normalize(token.getText());
-
- if (tokenFilter.isOK_Token(token, tokenNormalizer)) {
- key += " " + tokenText;
- tokens.add(tokenText);
- length++;
-
- //if (dumpDict)
- //{
- // System.err.println (" variant token aux:" + tokenText);
- //}
- }
- //else
- //{
- // if (dumpDict)
- // {
- // System.err.println (" SKIPPING: variant token aux:" + tokenText);
- // }
- //}
- }
- } else {
- variantProperties.setProperty(attrs.getQName(i), convertEntities(attrs.getValue(i)));
- }
-
- }
-
- String[] elements = (String[]) tokens.toArray(new String[tokens.size()]);
-
- String unsorted = null;
-
- if (sortElements) {
- unsorted = stringTogetherTokens(elements);
- Arrays.sort(elements);
- }
-
- String tokenString = stringTogetherTokens(elements);
- //if (dumpDict)
- //{
- // System.err.println ("token string: " + tokenString);
- //}
-
-
- // add to dictionary
- if (sortElements) {
- for (int i = 0; i < tokens.size(); i++) {
- dict.putEntry((String) tokens.get(i), tokenString, tokens, unsorted, tokens.size(),
- variantProperties);
- // System.err.println ("adding props for:" + tokenString);
- // Enumeration propKeys = variantProperties.keys();
- // while (propKeys.hasMoreElements())
- // {
- // String propKey = (String) propKeys.nextElement();
- // System.err.println ("\t" + propKey + ": " +
- // variantProperties.getProperty(propKey));
- // }
-
- }
- } else {
- dict.putEntry((String) tokens.get(0), tokenString, tokens, unsorted, tokens.size(),
- variantProperties);
- }
- term_cnt++;
- if ((term_cnt % 10000) == 0) {
- getLogger().logInfo("processed " + term_cnt + " entries");
- System.err.println("processed " + term_cnt + " entries");
- }
- }
- }
- }
-
- /**
- * @param elements
- * @return
- */
- private String stringTogetherTokens(String[] elements) {
- StringBuffer tokenString = new StringBuffer();
-
- for (int i = 0; i < elements.length; i++) {
- if (i > 0) {
- tokenString.append(" ");
- }
- tokenString.append(elements[i]);
- }
- return tokenString.toString();
- }
-
- //
- // ErrorHandler methods
- //
- /** Warning. */
- public void warning(SAXParseException ex) throws SAXException {
- getLogger().logWarning(errorString("Warning", ex));
- } // warning(SAXParseException)
-
- /**
- * Error. public void error (SAXParseException ex) throws SAXException { getLogger ().log
- * (Level.SEVERE, errorString ("Error", ex)); } // error(SAXParseException)
- */
- /** Fatal error. */
- public void fatalError(SAXParseException ex) throws SAXException {
- getLogger().logError(errorString("Fatal Error", ex));
- throw ex;
- } // fatalError(SAXParseException)
-
- /** Prints the error message. */
- protected String errorString(String type, SAXParseException ex) {
- String errorMsg = "[" + type + "]";
- if (ex == null) {
- return errorMsg + "!!!";
- }
- String systemId = ex.getSystemId();
- if (systemId != null) {
- int index = systemId.lastIndexOf('/');
- if (index != -1)
- systemId = systemId.substring(index + 1);
- errorMsg += systemId;
- }
- errorMsg += ":" + ex.getLineNumber() + ":" + ex.getColumnNumber() + ": " + ex.getMessage();
- return errorMsg;
- }
-
- /**
- * Use the tokenizer specified in 'tokenizerDescriptor' to load the specified dicitonary file.
- * The dictonary file must be in the format specified above. A new <code>dict</code> hashtable
- * is created with a <code>DictEntries</code> object for each unique first word in the base
- * fields in the dictionary file. The <code>dict</code> hashtable is keyed off of the first
- * word. The <code>DictEntries</code> for a first word contains a hashtable of
- * <code>DictEntry</code> objects for every phrase in the base fields of the dictionary file
- * started by the first word. The phrase hashtable is keyed off of the entire phrase.
- *
- * @param tokenizerDescriptor
- * @param tokenAnnotationName
- * @param tokenFilter
- *
- * @param dictFile
- * the fully specified filename of the dictionary file to load.
- *
- * @param NumOfInitialDictEntries
- * initial size of hashtable to create
- *
- * @exception java.io.IOException
- * if dictionary file cannot be loaded or some other initialization error occurs.
- */
- public void setDictionary(InputStream dictStream, int initialDictEntries,
- String tokenAnnotationName, String tokenTypeFeatureName, String tokenClassFeatureName,
- String tokenizerDescriptor, TokenFilter tokenFilter, TokenNormalizer tokenNormalizer, String langID)
- throws DictionaryLoaderException {
- term_cnt = 0;
- setTokenAnnotationName(tokenAnnotationName);
- setTokenizerDescriptor(tokenizerDescriptor);
- setTokenFilter(tokenFilter);
- setTokenNormalizer(tokenNormalizer);
- result = new Vector<DictionaryToken>();
-
- getLogger().logInfo("Loading dictionary");
- try {
- adaptor = new AnnotatorAdaptor(getTokenizerDescriptor(), result, tokenAnnotationName,
- tokenFilter, langID);
- adaptor.initCPM();
-
- parser.parse(new InputSource(dictStream));
- } catch (SAXException e) {
- getLogger().logError("Parse error occurred - " + e.getMessage());
- throw new DictionaryLoaderException();
- } catch (IOException e) {
- throw new DictionaryLoaderException();
- }
- getLogger().logInfo("Finished loading " + term_cnt + " entries");
- }
-
- public InputStream getInputStream() throws IOException {
- return dataResource.getInputStream();
- }
-
- }
-
- /*
- * (non-Javadoc)
- *
- * @see org.apache.uima.conceptMapper.support.DictionaryResource#NewDictionaryResource(int)
- */
- public DictionaryResource NewDictionaryResource(int initialDictEntries) {
- return new DictionaryResource_impl(initialDictEntries);
- }
-
- /*
- * (non-Javadoc)
- *
- * @see org.apache.uima.conceptMapper.support.DictionaryResource#keys()
- */
- public Enumeration<String> keys() {
- return dictImpl.keys();
- }
-
- public String toString() {
- StringBuffer result = new StringBuffer();
-
- Enumeration<String> e = keys();
-
- while (e.hasMoreElements()) {
- String key = e.nextElement();
- result.append("<DictionaryItem key='" + key + "'>\n");
- DictEntriesByLength_impl item = (DictEntriesByLength_impl) getEntries(key);
- result.append(item.toString());
- result.append("</DictionaryItem>\n");
- }
- return result.toString();
- }
-
- public void serializeEntries(FileOutputStream output) throws IOException {
- ObjectOutputStream oos = new ObjectOutputStream(output);
- oos.writeObject(this.dictImpl);
- oos.close();
- }
-
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.uima.conceptMapper.support.dictionaryResource;
+
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.ObjectOutputStream;
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Comparator;
+import java.util.Enumeration;
+import java.util.Hashtable;
+import java.util.Iterator;
+import java.util.Properties;
+import java.util.TreeMap;
+import java.util.Vector;
+import java.util.regex.Pattern;
+
+import org.apache.uima.analysis_engine.annotator.AnnotatorContext;
+import org.apache.uima.conceptMapper.Logger;
+import org.apache.uima.conceptMapper.support.dictionaryResource.annotatorAdaptor.AnnotatorAdaptor;
+import org.apache.uima.conceptMapper.support.tokens.TokenFilter;
+import org.apache.uima.conceptMapper.support.tokens.TokenNormalizer;
+import org.apache.uima.resource.DataResource;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.apache.uima.resource.SharedResourceObject;
+import org.xml.sax.Attributes;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+import org.xml.sax.SAXParseException;
+import org.xml.sax.XMLReader;
+import org.xml.sax.helpers.DefaultHandler;
+import org.xml.sax.helpers.XMLReaderFactory;
+
+/*
+ * @version $Revision: 1.5 $
+ *
+ * @author Michael Tanenblatt <mt...@us.ibm.com> @author (c) Copyright 2005 IBM @author All Rights
+ * Reserved
+ */
+/**
+ * Implementation of a UIMA DictionaryResource
+ */
+
+public class DictionaryResource_impl implements DictionaryResource, SharedResourceObject {
+ /** Dictionary file loader. Uses an XML parser. */
+ protected DictLoader dictLoader;
+
+ /**
+ * Hashtable of first words. Contains a DictEntries object keyed on word string for the first word
+ * of every entry in the specified dictionary.
+ */
+ protected Hashtable<String, DictEntriesByLength> dictImpl;
+
+ /** Initial size of <code>dict</code> */
+ protected static final int NumOfInitialDictEntries = 500000;
+
+ boolean loaded;
+
+ /** Patterns to for matcher to replace SGML &lt; entities */
+ private static final Pattern ltPattern = Pattern.compile("<");
+
+ /** Patterns to for matcher to replace SGML &gt; entities */
+ private static final Pattern gtPattern = Pattern.compile(">");
+
+ /** Patterns to for matcher to replace SGML &apos; entities */
+ private static final Pattern aposPattern = Pattern.compile("'");
+
+ /** Patterns to for matcher to replace SGML &quot; entities */
+ private static final Pattern quotPattern = Pattern.compile(""");
+
+ /** Patterns to for matcher to replace SGML &amp; entities */
+ private static final Pattern ampPattern = Pattern.compile("&");
+
+ /** Configuration parameter key/label for the order independent lookup indicator */
+ public static final String PARAM_ORDERINDEPENDENTLOOKUP = "OrderIndependentLookup";
+
+ private boolean sortElements = false;
+
+ /** Configuration parameter key/label to indicate whether dictionary should be printed upon load */
+ public static final String PARAM_DUMPDICT = "PrintDictionary";
+
+ private boolean dumpDict = false;
+
+ /** Configuration parameter key/label for the case matching string */
+ public static final String PARAM_CASE_MATCH = "caseMatch";
+
+ /** Configuration parameter key/label for the stemmer class spec. If left out, no stemmer is used */
+ public static final String PARAM_STEMMER_CLASS = "Stemmer";
+
+ /**
+ * Configuration parameter key/label for the stemmer dictionary, passed into the stemmer's
+ * initialization method
+ */
+ public static final String PARAM_STEMMER_DICT = "StemmerDictionary";
+
+ private static final String PARAM_LANGID = "LanguageID";
+
+ private static final String DEFAULT_LANGID = "en";
+
+ public int entryNum = 0;
+
+ /**
+ * @param NumOfInitialDictEntries
+ *
+ */
+ public DictionaryResource_impl() {
+ super();
+ dictImpl = new Hashtable<String, DictEntriesByLength>();
+ loaded = false;
+ }
+
+ /**
+ * @param NumOfInitialDictEntries
+ *
+ */
+ public DictionaryResource_impl(int initialDictEntries) {
+ super();
+ dictImpl = new Hashtable<String, DictEntriesByLength>(initialDictEntries);
+ loaded = false;
+ }
+
+ /**
+ * @return Returns the dictLoader.
+ */
+ public DictLoader getDictLoader() {
+ return dictLoader;
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see org.apache.uima.conceptMapper.conceptMapper.support.DictionaryResource#getEntry(java.lang.String)
+ */
+ public DictEntriesByLength getEntries(String key) {
+ return dictImpl.get(key);
+ }
+
+ /**
+ * Create a new dictionary entry.
+ *
+ * @param key
+ * the key to index on
+ * @param phrase
+ * the phrase to be entered in the dictionary
+ * @param length
+ * the number of words in the phrase (>=1)
+ * @param props
+ * the properties object for the dictionary entry
+ */
+ public void putEntry(String key, String text, ArrayList<String> elements, String unsorted,
+ int length, Properties props) {
+ DictEntriesByLength entry = getEntries(key);
+
+ if (entry == null) {
+ entry = new DictEntriesByLength_impl();
+ // System.err.println ("DICT adding: " + key + ", " + text);
+ dictImpl.put(key, entry);
+ }
+ // Iterator<String> elemIter = elements.iterator ();
+ // while (elemIter.hasNext ())
+ // {
+ // System.err.println("ENTRY ELEMENT: " + elemIter.next ());
+ // }
+ entry.putEntry(length, text, elements, unsorted, props);
+ // System.err.println("ENTRY: '" + text + "', PROPS: ");
+ // Enumeration propKeys = props.keys();
+ // while (propKeys.hasMoreElements())
+ // {
+ // String propKey = (String) propKeys.nextElement();
+ // System.err.println ("\t" + propKey + ": " + props.getProperty(propKey));
+ // }
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see org.apache.uima.resource.SharedResourceObject#load(org.apache.uima.resource.DataResource)
+ */
+ public void load(DataResource data) throws ResourceInitializationException {
+ // InputStream dictStream = null;
+ try {
+ Logger logger = new Logger("Dictionary Resource Loader", data.getLogger());
+ dictLoader = new DictLoader(logger, this, data);
+
+ /*
+ * At least for now, while info from the AnnotatorContext is not available here, must delay
+ * loading until able to access this info
+ *
+ * //open input stream to data dictStream = data.getInputStream ();
+ *
+ * dictLoader = new DictLoader (data.getLogger (), this); dictLoader.setDictionary
+ * (dictStream, NumOfInitialDictEntries);
+ */
+ } catch (Exception e) {
+ throw new ResourceInitializationException(e);
+ }
+ /*
+ * finally { if (dictStream != null) { try { dictStream.close (); } catch (IOException e) { } } }
+ */
+ }
+
+ public void loadDictionaryContents(AnnotatorContext aContext, Logger logger,
+ String tokenAnnotationName, String tokenTypeFeatureName, String tokenClassFeatureName,
+ String tokenizerDescriptor) throws ResourceInitializationException {
+
+ InputStream dictStream = null;
+ try {
+ dictLoader.setLogger(logger);
+ Boolean sortElementsParam = new Boolean(false);
+
+ sortElementsParam = (Boolean) aContext.getConfigParameterValue(PARAM_ORDERINDEPENDENTLOOKUP);
+ if (sortElementsParam == null) {
+ sortElements = false;
+ } else {
+ sortElements = sortElementsParam.booleanValue();
+ }
+
+ Boolean dumpDictParam = (Boolean) aContext.getConfigParameterValue(PARAM_DUMPDICT);
+ if (dumpDictParam == null) {
+ dumpDict = false;
+ } else {
+ dumpDict = dumpDictParam.booleanValue();
+ }
+
+ TokenNormalizer tokenNormalizer = new TokenNormalizer(aContext, logger);
+
+ // open input stream to data
+ dictStream = dictLoader.getInputStream();
+
+ TokenFilter tokenFilter = new TokenFilter(tokenAnnotationName, tokenTypeFeatureName,
+ tokenClassFeatureName, logger);
+ tokenFilter.initConfig(aContext);
+
+ String langID = (String) aContext.getConfigParameterValue(PARAM_LANGID);
+ if ((langID == null) || (langID == ""))
+ {
+ langID = DEFAULT_LANGID;
+ }
+ // System.out.print ("Loading Dictionary: '" + dictLoader.dataResource.getUri().toString() +
+ // "'...");
+ // System.out.print ("Loading Dictionary...");
+ logger.logInfo("Loading Dictionary...");
+ dictLoader.setDictionary(dictStream, NumOfInitialDictEntries, tokenAnnotationName,
+ tokenTypeFeatureName, tokenClassFeatureName, tokenizerDescriptor, tokenFilter,
+ tokenNormalizer, langID);
+ logger.logInfo("...done");
+ // System.out.println ("done");
+ // System.err.println("NEW DICT:\n" + toString());
+ setLoaded(true);
+ if (dumpDict)
+ {
+ System.err.println("BEGIN DUMPING DICTIONARY:");
+ System.err.println(toString());
+ System.err.println("DONE DUMPING DICTIONARY! ");
+ }
+
+ } catch (Exception e) {
+ throw new ResourceInitializationException(e);
+ } finally {
+ if (dictStream != null) {
+ try {
+ dictStream.close();
+ } catch (IOException e) {
+ }
+ }
+ }
+
+ }
+
+ /**
+ * @return Returns the loaded.
+ */
+ public boolean isLoaded() {
+ return loaded;
+ }
+
+ /**
+ * @param loaded
+ * The loaded to set.
+ */
+ public void setLoaded(boolean loaded) {
+ this.loaded = loaded;
+ }
+
+ private static class DictEntriesByLength_impl implements DictEntriesByLength {
+ private static final long serialVersionUID = -8150386021246495622L;
+
+ private class ReverseOrderIntegerComparator implements Comparator<Integer>, Serializable {
+
+ /**
+ *
+ */
+ private static final long serialVersionUID = -805437355806223406L;
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see java.util.Comparator#compare(java.lang.Object, java.lang.Object)
+ */
+ public int compare(Integer left, Integer right) {
+ // reverse the order of parameters, to reverse the sorting order
+ return (right.compareTo(left));
+ }
+
+ }
+
+ TreeMap<Integer, DictEntries> entries;
+
+ public DictEntriesByLength_impl() {
+ super();
+ entries = new TreeMap<Integer, DictEntries>(new ReverseOrderIntegerComparator());
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see org.apache.uima.conceptMapper.support.dictionaryResource.DictionaryResource.DictEntriesByLength#getEntries(int)
+ */
+ public DictEntries getEntries(int length) {
+ return entries.get(new Integer(length));
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see org.apache.uima.conceptMapper.support.dictionaryResource.DictionaryResource.DictEntriesByLength#putEntry(int,
+ * java.lang.String, java.util.Properties)
+ */
+ public void putEntry(int length, String text, ArrayList<String> elements, String unsorted,
+ Properties props) {
+ DictEntries entry = getEntries(length);
+ if (entry == null) {
+ entry = new DictEntriesImpl();
+ entries.put(new Integer(length), entry);
+ }
+ entry.putEntry(text, elements, unsorted, props);
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see org.apache.uima.conceptMapper.support.dictionaryResource.DictionaryResource.DictEntriesByLength#getLongest()
+ */
+ public Integer getLongest() {
+ return entries.firstKey();
+ }
+
+ public Integer getShortest() {
+ return entries.lastKey();
+ }
+
+ public String toString() {
+ StringBuffer result = new StringBuffer();
+
+ int i = getLongest().intValue();
+ int last = getShortest().intValue();
+
+ while (i >= last) {
+ DictEntriesImpl entries = (DictEntriesImpl) getEntries(i);
+ if (entries != null) {
+ result.append("<DictEntriesByLength length='" + i + "'>\n");
+ result.append(entries.toString());
+ result.append("</DictEntriesByLength>\n");
+ }
+ i--;
+ }
+ return result.toString();
+ }
+ }
+
+ /**
+ * Private class for storing first words in the dict hashtable.
+ */
+ public static class DictEntriesImpl extends ArrayList<DictEntry> implements DictEntries {
+
+ /**
+ *
+ */
+ private static final long serialVersionUID = 1L;
+
+ /**
+ * Create a new dictionary entry.
+ */
+ public DictEntriesImpl() {
+ }
+
+ /**
+ * Add a new phrase to an existing dictionary entry.
+ *
+ * @param text
+ * the text to be entered in the dictionary
+ * @param props
+ * the properties object for the phrase
+ */
+ public void putEntry(String text, Collection<String> elements, String unsorted, Properties props) {
+ add(new DictEntryImpl(text, elements, unsorted, props));
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see org.apache.uima.conceptMapper.support.dictionaryResource.DictionaryResource.DictEntries#getEntry(java.lang.String)
+ */
+ public ArrayList<DictEntry> getEntries() {
+ return this;
+ }
+
+ public String toString() {
+ StringBuffer result = new StringBuffer("<DictEntries>");
+ for (int i = 0; i < size(); i++) {
+ result.append(((DictEntryImpl) get(i)).toString());
+ }
+ result.append("</DictEntries>");
+ return result.toString();
+ }
+
+ }
+
+ public static class DictEntryImpl implements DictEntry {
+ private static final long serialVersionUID = -7723934333674544157L;
+
+ String text;
+
+ Collection<String> elements;
+
+ String unsorted;
+
+ Properties properties;
+
+ /**
+ * @param text
+ * @param elements
+ * @param unsorted
+ * @param properties
+ */
+ public DictEntryImpl(String text, Collection<String> elements, String unsorted,
+ Properties properties) {
+ super();
+ this.properties = properties;
+ this.unsorted = unsorted;
+ this.text = text;
+ this.elements = elements;
+ }
+
+ /**
+ * @return Returns the text.
+ */
+ public String getText() {
+ return text;
+ }
+
+ /**
+ * @param text
+ * The text to set.
+ */
+ public void setText(String text) {
+ this.text = text;
+ }
+
+ /**
+ * @param properties
+ * The properties to set.
+ */
+ public void setProperties(Properties properties) {
+ this.properties = properties;
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see org.apache.uima.conceptMapper.support.dictionaryResource.DictionaryResource.DictEntry#getProperties()
+ */
+ public Properties getProperties() {
+ return properties;
+ }
+
+ public String toString() {
+ StringBuffer result = new StringBuffer("<DictEntry Text ='" + getText() + "'>");
+ Enumeration<?> e = getProperties().propertyNames();
+
+ while (e.hasMoreElements()) {
+ String propName = (String) e.nextElement();
+ result.append("<property name='" + propName + "'>");
+ String item = getProperties().getProperty(propName);
+ result.append(item);
+ result.append("</property>\n");
+ }
+ result.append("</DictEntry>\n");
+ return result.toString();
+
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see org.apache.uima.conceptMapper.support.dictionaryResource.DictionaryResource.DictEntry#getUnsorted()
+ */
+ public String getUnsorted() {
+ return unsorted;
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see org.apache.uima.conceptMapper.support.dictionaryResource.DictionaryResource.DictEntry#setUnsorted(java.lang.String)
+ */
+ public void setUnsorted(String unsorted) {
+ this.unsorted = unsorted;
+ }
+
+ public Collection<String> getElements() {
+ return elements;
+ }
+
+ public void setElements(Collection<String> elements) {
+ this.elements = elements;
+ }
+
+ }
+
+ /**
+ * Convert character entities in a string to the corresponding character. The set of entities
+ * handled includes:
+ * <ul>
+ * <li>&amp;
+ * <li>&lt;
+ * <li>&gt;
+ * <li>&apos;
+ * <li>&quot;
+ * </ul>
+ *
+ * @param input
+ * the string to process.
+ *
+ * @return the string with converted entities
+ */
+ protected String convertEntities(String input) {
+ String result = ltPattern.matcher(input).replaceAll("<");
+ result = gtPattern.matcher(result).replaceAll(">");
+ result = aposPattern.matcher(result).replaceAll("'");
+ result = quotPattern.matcher(result).replaceAll("\"");
+ result = ampPattern.matcher(result).replaceAll("&");
+
+ return (result);
+ }
+
+ /**
+ * Private class to load the dictionary file. Extends the org.xml.sax.helpers.DefaultHandler for
+ * XML parsing.
+ */
+ private class DictLoader extends DefaultHandler {
+ /** Default parser name. */
+ protected static final String DEFAULT_PARSER_NAME = "org.apache.xerces.parsers.SAXParser";
+
+ /** Default name of element that contains dictionary records. */
+ protected static final String DEFAULT_TOKEN_ELEM = "token";
+
+ /** Default name of element that contains variant form */
+ protected static final String DEFAULT_VARIANT_ELEM = "variant";
+
+ /**
+ * Default name of attribute in the token element that contains the key for the entry (i.e., the
+ * surface or variant form that will be found in the document).
+ */
+ protected static final String DEFAULT_KEY_ATTR = "base";
+
+ /** Default name of element that contains part of speech */
+ // protected static final String DEFAULT_POS_ATTR = "POS";
+ /** The XML parser that parses the dictionary file. */
+ private XMLReader parser = null;
+
+ /** The name of the element that contains the dictionary records. */
+ private String token_elem = DEFAULT_TOKEN_ELEM;
+
+ /**
+ * The name of the element that contains a variant form for the current entry
+ */
+ private String variant_elem = DEFAULT_VARIANT_ELEM;
+
+ /** The name of the attribute that contains the key. */
+ private String key_attribute = DEFAULT_KEY_ATTR;
+
+ /** The name of the attribute that contains the part of speech. */
+ // private String pos_attribute = DEFAULT_POS_ATTR;
+ /** Count of number of dictionary entries loaded. */
+ private int term_cnt = 0;
+
+ /** Reference to UIMA logger. */
+ private Logger logger;
+
+ /** The hashtable built while parsing the dictionary file. */
+ private DictionaryResource dict;
+
+ /** Properties for current canonical form */
+ private Properties props;
+
+ private AnnotatorAdaptor adaptor;
+
+ private String tokenAnnotationName;
+
+ private String tokenizerDescriptor;
+
+ private TokenFilter tokenFilter;
+
+ private TokenNormalizer tokenNormalizer;
+
+ /**
+ * needed to access input stream, since cannot load external dict resource until TAE config
+ * params are available to set up tokenizer correctly
+ */
+ DataResource dataResource;
+
+ private Vector<DictionaryToken> result;
+
+ /**
+ * Create a dictionary loader.
+ *
+ * @param The
+ * UIMA logger.
+ *
+ * @throws Exception
+ * if XML parser cannot be created.
+ */
+ public DictLoader(Logger log, DictionaryResource dict, DataResource data) throws Exception {
+ this.setLogger(log);
+ this.dict = dict;
+ this.dataResource = data;
+
+ // create parser
+ try {
+ parser = XMLReaderFactory.createXMLReader(DEFAULT_PARSER_NAME);
+ } catch (Exception e) {
+ log.logError("Unable to instantiate dictionary parser (" + DEFAULT_PARSER_NAME + ")");
+ throw (e);
+ }
+ parser.setContentHandler(this);
+ parser.setErrorHandler(this);
+
+ }
+
+ protected String getTokenAnnotationName() {
+ return tokenAnnotationName;
+ }
+
+ protected void setTokenAnnotationName(String tokenAnnotationName) {
+ this.tokenAnnotationName = tokenAnnotationName;
+ }
+
+ protected String getTokenizerDescriptor() {
+ return tokenizerDescriptor;
+ }
+
+ protected void setTokenizerDescriptor(String tokenizerDescriptor) {
+ this.tokenizerDescriptor = tokenizerDescriptor;
+ }
+
+ protected void setTokenFilter(TokenFilter tokenFilter) {
+ this.tokenFilter = tokenFilter;
+ }
+
+ protected TokenFilter getTokenFilter() {
+ return tokenFilter;
+ }
+
+ protected void setTokenNormalizer(TokenNormalizer tokenNormalizer) {
+ this.tokenNormalizer = tokenNormalizer;
+ }
+
+ protected TokenNormalizer getTokenNormalizer() {
+ return tokenNormalizer;
+ }
+
+ /**
+ * @return Returns the log.
+ */
+ public Logger getLogger() {
+ return logger;
+ }
+
+ /**
+ * @param log
+ * The log to set.
+ */
+ public void setLogger(Logger logger) {
+ this.logger = logger;
+ }
+
+ /**
+ * Start element. This method does most of the work of building the hashtable.
+ *
+ */
+ public void startElement(String uri, String local, String raw, Attributes attrs)
+ throws SAXException {
+
+ String key = null;
+ DictionaryToken token = null;
+ int length = 0;
+
+ if (raw.equals(token_elem)) { // starting new token entry
+ if (attrs != null) {
+
+ props = new Properties();
+ int attrCount = attrs.getLength();
+ for (int i = 0; i < attrCount; i++) {
+ props.setProperty(attrs.getQName(i), convertEntities(attrs.getValue(i)));
+ }
+ }
+ } else if (raw.equals(variant_elem)) { // variant for current token
+ if (attrs != null) {
+ int attrCount = attrs.getLength();
+ ArrayList<String> tokens = new ArrayList<String>();
+
+ // if this variant contains its own POS info, save token level POS info and set props to
+ // contain variant's
+ Properties variantProperties = new Properties(props);
+ // logger.logInfo("" + entryNum++);
+ //
+ // System.err.println("" + entryNum++);
+ for (int i = 0; i < attrCount; i++) {
+ if (attrs.getQName(i).equals(key_attribute)) { // key attribute?
+
+ adaptor.runCPM(convertEntities(attrs.getValue(i)));
+
+ //if (dumpDict)
+ //{
+ // System.err.println ("Adaptor done, result size: " + result.size() + "\nresult:\n");
+ // for (DictionaryToken resultItem : result)
+ // {
+ // System.err.println (" " + resultItem.getText());
+ // }
+ //}
+
+ Iterator<DictionaryToken> tokenIter = result.iterator();
+ token = null;
+
+ while (tokenIter.hasNext()) {
+ token = (DictionaryToken) tokenIter.next();
+ //if (dumpDict)
+ //{
+ // System.err.println ("TOKEN CLASS: '" + token.getTokenClass() + "', TOKEN TYPE: '" + token.getType() + "'");
+ //}
+ if (tokenFilter.isOK_Token(token, tokenNormalizer)) {
+ break;
+ }
+ }
+
+ if (token == null) {
+ return;
+ }
+
+ key = new String(tokenNormalizer.normalize(token.getText()));
+ //if (dumpDict)
+ //{
+ // System.err.println ("variant token key:" + key);
+ //}
+
+ tokens.add(key);
+ length = 1;
+ while (tokenIter.hasNext()) {
+ token = (DictionaryToken) tokenIter.next();
+ String tokenText = tokenNormalizer.normalize(token.getText());
+
+ if (tokenFilter.isOK_Token(token, tokenNormalizer)) {
+ key += " " + tokenText;
+ tokens.add(tokenText);
+ length++;
+
+ //if (dumpDict)
+ //{
+ // System.err.println (" variant token aux:" + tokenText);
+ //}
+ }
+ //else
+ //{
+ // if (dumpDict)
+ // {
+ // System.err.println (" SKIPPING: variant token aux:" + tokenText);
+ // }
+ //}
+ }
+ } else {
+ variantProperties.setProperty(attrs.getQName(i), convertEntities(attrs.getValue(i)));
+ }
+
+ }
+
+ String[] elements = (String[]) tokens.toArray(new String[tokens.size()]);
+
+ String unsorted = null;
+
+ if (sortElements) {
+ unsorted = stringTogetherTokens(elements);
+ Arrays.sort(elements);
+ }
+
+ String tokenString = stringTogetherTokens(elements);
+ //if (dumpDict)
+ //{
+ // System.err.println ("token string: " + tokenString);
+ //}
+
+
+ // add to dictionary
+ if (sortElements) {
+ for (int i = 0; i < tokens.size(); i++) {
+ dict.putEntry((String) tokens.get(i), tokenString, tokens, unsorted, tokens.size(),
+ variantProperties);
+ // System.err.println ("adding props for:" + tokenString);
+ // Enumeration propKeys = variantProperties.keys();
+ // while (propKeys.hasMoreElements())
+ // {
+ // String propKey = (String) propKeys.nextElement();
+ // System.err.println ("\t" + propKey + ": " +
+ // variantProperties.getProperty(propKey));
+ // }
+
+ }
+ } else {
+ dict.putEntry((String) tokens.get(0), tokenString, tokens, unsorted, tokens.size(),
+ variantProperties);
+ }
+ term_cnt++;
+ if ((term_cnt % 10000) == 0) {
+ getLogger().logInfo("processed " + term_cnt + " entries");
+ //System.err.println("processed " + term_cnt + " entries");
+ }
+ }
+ }
+ }
+
+ /**
+ * @param elements
+ * @return
+ */
+ private String stringTogetherTokens(String[] elements) {
+ StringBuffer tokenString = new StringBuffer();
+
+ for (int i = 0; i < elements.length; i++) {
+ if (i > 0) {
+ tokenString.append(" ");
+ }
+ tokenString.append(elements[i]);
+ }
+ return tokenString.toString();
+ }
+
+ //
+ // ErrorHandler methods
+ //
+ /** Warning. */
+ public void warning(SAXParseException ex) throws SAXException {
+ getLogger().logWarning(errorString("Warning", ex));
+ } // warning(SAXParseException)
+
+ /**
+ * Error. public void error (SAXParseException ex) throws SAXException { getLogger ().log
+ * (Level.SEVERE, errorString ("Error", ex)); } // error(SAXParseException)
+ */
+ /** Fatal error. */
+ public void fatalError(SAXParseException ex) throws SAXException {
+ getLogger().logError(errorString("Fatal Error", ex));
+ throw ex;
+ } // fatalError(SAXParseException)
+
+ /** Prints the error message. */
+ protected String errorString(String type, SAXParseException ex) {
+ String errorMsg = "[" + type + "]";
+ if (ex == null) {
+ return errorMsg + "!!!";
+ }
+ String systemId = ex.getSystemId();
+ if (systemId != null) {
+ int index = systemId.lastIndexOf('/');
+ if (index != -1)
+ systemId = systemId.substring(index + 1);
+ errorMsg += systemId;
+ }
+ errorMsg += ":" + ex.getLineNumber() + ":" + ex.getColumnNumber() + ": " + ex.getMessage();
+ return errorMsg;
+ }
+
+ /**
+ * Use the tokenizer specified in 'tokenizerDescriptor' to load the specified dicitonary file.
+ * The dictonary file must be in the format specified above. A new <code>dict</code> hashtable
+ * is created with a <code>DictEntries</code> object for each unique first word in the base
+ * fields in the dictionary file. The <code>dict</code> hashtable is keyed off of the first
+ * word. The <code>DictEntries</code> for a first word contains a hashtable of
+ * <code>DictEntry</code> objects for every phrase in the base fields of the dictionary file
+ * started by the first word. The phrase hashtable is keyed off of the entire phrase.
+ *
+ * @param tokenizerDescriptor
+ * @param tokenAnnotationName
+ * @param tokenFilter
+ *
+ * @param dictFile
+ * the fully specified filename of the dictionary file to load.
+ *
+ * @param NumOfInitialDictEntries
+ * initial size of hashtable to create
+ *
+ * @exception java.io.IOException
+ * if dictionary file cannot be loaded or some other initialization error occurs.
+ */
+ public void setDictionary(InputStream dictStream, int initialDictEntries,
+ String tokenAnnotationName, String tokenTypeFeatureName, String tokenClassFeatureName,
+ String tokenizerDescriptor, TokenFilter tokenFilter, TokenNormalizer tokenNormalizer, String langID)
+ throws DictionaryLoaderException {
+ term_cnt = 0;
+ setTokenAnnotationName(tokenAnnotationName);
+ setTokenizerDescriptor(tokenizerDescriptor);
+ setTokenFilter(tokenFilter);
+ setTokenNormalizer(tokenNormalizer);
+ result = new Vector<DictionaryToken>();
+
+ getLogger().logInfo("Loading dictionary");
+ try {
+ adaptor = new AnnotatorAdaptor(getTokenizerDescriptor(), result, tokenAnnotationName,
+ tokenFilter, langID, getLogger());
+ adaptor.initCPM();
+
+ parser.parse(new InputSource(dictStream));
+ } catch (SAXException e) {
+ getLogger().logError("Parse error occurred - " + e.getMessage());
+ throw new DictionaryLoaderException();
+ } catch (IOException e) {
+ throw new DictionaryLoaderException();
+ }
+ getLogger().logInfo("Finished loading " + term_cnt + " entries");
+ }
+
+ public InputStream getInputStream() throws IOException {
+ return dataResource.getInputStream();
+ }
+
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see org.apache.uima.conceptMapper.support.DictionaryResource#NewDictionaryResource(int)
+ */
+ public DictionaryResource NewDictionaryResource(int initialDictEntries) {
+ return new DictionaryResource_impl(initialDictEntries);
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see org.apache.uima.conceptMapper.support.DictionaryResource#keys()
+ */
+ public Enumeration<String> keys() {
+ return dictImpl.keys();
+ }
+
+ public String toString() {
+ StringBuffer result = new StringBuffer();
+
+ Enumeration<String> e = keys();
+
+ while (e.hasMoreElements()) {
+ String key = e.nextElement();
+ result.append("<DictionaryItem key='" + key + "'>\n");
+ DictEntriesByLength_impl item = (DictEntriesByLength_impl) getEntries(key);
+ result.append(item.toString());
+ result.append("</DictionaryItem>\n");
+ }
+ return result.toString();
+ }
+
+ public void serializeEntries(FileOutputStream output) throws IOException {
+ ObjectOutputStream oos = new ObjectOutputStream(output);
+ oos.writeObject(this.dictImpl);
+ oos.close();
+ }
+
}
\ No newline at end of file
Modified: incubator/uima/sandbox/trunk/ConceptMapper/src/main/java/org/apache/uima/conceptMapper/support/dictionaryResource/annotatorAdaptor/AnnotatorAdaptor.java
URL: http://svn.apache.org/viewvc/incubator/uima/sandbox/trunk/ConceptMapper/src/main/java/org/apache/uima/conceptMapper/support/dictionaryResource/annotatorAdaptor/AnnotatorAdaptor.java?rev=766019&r1=766018&r2=766019&view=diff
==============================================================================
--- incubator/uima/sandbox/trunk/ConceptMapper/src/main/java/org/apache/uima/conceptMapper/support/dictionaryResource/annotatorAdaptor/AnnotatorAdaptor.java (original)
+++ incubator/uima/sandbox/trunk/ConceptMapper/src/main/java/org/apache/uima/conceptMapper/support/dictionaryResource/annotatorAdaptor/AnnotatorAdaptor.java Fri Apr 17 14:08:15 2009
@@ -1,129 +1,139 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.uima.conceptMapper.support.dictionaryResource.annotatorAdaptor;
-
-import java.io.IOException;
-import java.util.Vector;
-
-import org.apache.uima.UIMAFramework;
-import org.apache.uima.analysis_engine.AnalysisEngine;
-import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
-import org.apache.uima.cas.CAS;
-import org.apache.uima.cas.FSIterator;
-import org.apache.uima.cas.Feature;
-import org.apache.uima.cas.Type;
-import org.apache.uima.cas.TypeSystem;
-import org.apache.uima.cas.text.AnnotationFS;
-import org.apache.uima.conceptMapper.support.dictionaryResource.DictionaryLoaderException;
-import org.apache.uima.conceptMapper.support.dictionaryResource.DictionaryToken;
-import org.apache.uima.conceptMapper.support.tokens.TokenFilter;
-import org.apache.uima.conceptMapper.support.tokens.UnknownTypeException;
-import org.apache.uima.resource.ResourceInitializationException;
-import org.apache.uima.resource.ResourceSpecifier;
-import org.apache.uima.util.InvalidXMLException;
-import org.apache.uima.util.XMLInputSource;
-
-public class AnnotatorAdaptor {
- private ResourceSpecifier aeSpecifier;
-
- private AnalysisEngine ae;
-
- private CAS cas;
-
- private Vector<DictionaryToken> result;
-
- private TokenFilter tokenFilter;
-
- private String tokenTypeName;
-
- private Type tokenType;
- private Feature tokenTypeFeature;
- private Feature tokenClassFeature;
-
- // private static Set<Type> tokenAnnotationSubTypes = null;
- private boolean typeSystemInitialized = false;
-
- private String langID;
-
- public AnnotatorAdaptor(String analysisEngineDescriptorPath, Vector<DictionaryToken> result,
- String tokenTypeName, TokenFilter tokenFilter, String langID)
- throws DictionaryLoaderException {
- super();
- try {
- aeSpecifier = UIMAFramework.getXMLParser().parseResourceSpecifier(
- new XMLInputSource(analysisEngineDescriptorPath));
- this.tokenTypeName = tokenTypeName;
- this.tokenTypeFeature = tokenFilter.getTokenTypeFeature();
- this.tokenClassFeature = tokenFilter.getTokenClassFeature();
- this.tokenFilter = tokenFilter;
- this.langID = langID;
- this.result = result;
- } catch (InvalidXMLException e) {
- throw new DictionaryLoaderException(e);
- } catch (IOException e) {
- throw new DictionaryLoaderException(e);
- }
- }
-
- public void initCPM() throws DictionaryLoaderException {
- try {
- ae = UIMAFramework.produceAnalysisEngine(aeSpecifier);
- cas = ae.newCAS();
- } catch (ResourceInitializationException e) {
- throw new DictionaryLoaderException(e);
- }
- }
-
- public void runCPM(String text) {
- cas.setDocumentText(text);
- cas.setDocumentLanguage(langID);
- try {
- ae.process(cas);
- } catch (AnalysisEngineProcessException e) {
- e.printStackTrace();
- }
- processCAS(cas);
- cas.reset();
- }
-
- private void processCAS(CAS cas) {
- if (cas == null) {
- } else {
- try {
- // System.err.println ("processCAS(), doc text: " + cas.getDocumentText());
- result.clear();
- if (!typeSystemInitialized) {
- TypeSystem typeSystem = cas.getTypeSystem();
- tokenFilter.initTypes(typeSystem, false);
- tokenType = typeSystem.getType(tokenTypeName);
- typeSystemInitialized = true;
- }
-
- FSIterator tokenIter = cas.getIndexRepository().getAllIndexedFS(tokenType);
- while (tokenIter.hasNext()) {
- AnnotationFS annotation = (AnnotationFS) tokenIter.next();
- result.add(new DictionaryToken(annotation, tokenTypeFeature, tokenClassFeature));
- }
- } catch (UnknownTypeException e) {
- System.err.println(e.getMessage());
- e.printStackTrace();
- }
- }
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.uima.conceptMapper.support.dictionaryResource.annotatorAdaptor;
+
+import java.io.IOException;
+import java.util.Vector;
+
+import org.apache.uima.UIMAFramework;
+import org.apache.uima.analysis_engine.AnalysisEngine;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.FSIterator;
+import org.apache.uima.cas.Feature;
+import org.apache.uima.cas.Type;
+import org.apache.uima.cas.TypeSystem;
+import org.apache.uima.cas.text.AnnotationFS;
+import org.apache.uima.conceptMapper.Logger;
+import org.apache.uima.conceptMapper.support.dictionaryResource.DictionaryLoaderException;
+import org.apache.uima.conceptMapper.support.dictionaryResource.DictionaryToken;
+import org.apache.uima.conceptMapper.support.tokens.TokenFilter;
+import org.apache.uima.conceptMapper.support.tokens.UnknownTypeException;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.apache.uima.resource.ResourceSpecifier;
+import org.apache.uima.util.InvalidXMLException;
+import org.apache.uima.util.XMLInputSource;
+
+public class AnnotatorAdaptor {
+ private ResourceSpecifier aeSpecifier;
+
+ private AnalysisEngine ae;
+
+ private CAS cas;
+
+ private Vector<DictionaryToken> result;
+
+ private TokenFilter tokenFilter;
+
+ private String tokenTypeName;
+
+ private Type tokenType;
+ private Feature tokenTypeFeature;
+ private Feature tokenClassFeature;
+
+ // private static Set<Type> tokenAnnotationSubTypes = null;
+ private boolean typeSystemInitialized = false;
+
+ private String langID;
+
+ private Logger logger;
+
+ public AnnotatorAdaptor(String analysisEngineDescriptorPath, Vector<DictionaryToken> result,
+ String tokenTypeName, TokenFilter tokenFilter, String langID, Logger logger)
+ throws DictionaryLoaderException {
+ super();
+ try {
+ aeSpecifier = UIMAFramework.getXMLParser().parseResourceSpecifier(
+ new XMLInputSource(analysisEngineDescriptorPath));
+ this.tokenTypeName = tokenTypeName;
+ this.tokenTypeFeature = tokenFilter.getTokenTypeFeature();
+ this.tokenClassFeature = tokenFilter.getTokenClassFeature();
+ this.tokenFilter = tokenFilter;
+ this.langID = langID;
+ this.result = result;
+ this.logger = logger;
+ } catch (InvalidXMLException e) {
+ throw new DictionaryLoaderException(e);
+ } catch (IOException e) {
+ throw new DictionaryLoaderException(e);
+ }
+ }
+
+ public void initCPM() throws DictionaryLoaderException {
+ try {
+ ae = UIMAFramework.produceAnalysisEngine(aeSpecifier);
+ cas = ae.newCAS();
+ } catch (ResourceInitializationException e) {
+ throw new DictionaryLoaderException(e);
+ }
+ }
+
+ public void runCPM(String text) {
+ cas.setDocumentText(text);
+ cas.setDocumentLanguage(langID);
+ try {
+ ae.process(cas);
+ } catch (AnalysisEngineProcessException e) {
+ e.printStackTrace();
+ }
+ processCAS(cas);
+ cas.reset();
+ }
+
+ private void processCAS(CAS cas) {
+ if (cas == null) {
+ } else {
+ try {
+ // System.err.println ("processCAS(), doc text: " + cas.getDocumentText());
+ result.clear();
+ if (!typeSystemInitialized) {
+ TypeSystem typeSystem = cas.getTypeSystem();
+ tokenFilter.initTypes(typeSystem, false);
+ tokenType = typeSystem.getType(tokenTypeName);
+ typeSystemInitialized = true;
+ }
+
+ FSIterator tokenIter = cas.getIndexRepository().getAllIndexedFS(tokenType);
+ // System.err.println ("any tokens found? " + tokenIter.hasNext ());
+ while (tokenIter.hasNext()) {
+ AnnotationFS annotation = (AnnotationFS) tokenIter.next();
+ result.add(new DictionaryToken(annotation, tokenTypeFeature, tokenClassFeature));
+ }
+ if (result.size () == 0)
+ {
+ // System.err.println ("Dictionary tokenization of: '" + cas.getDocumentText() + "' produced no tokens of type: '" + tokenType.getName () + "'");
+ logger.logWarning ("Dictionary tokenization of: '" + cas.getDocumentText() + "' produced no tokens of type: '" + tokenType.getName () + "'");
+ }
+ } catch (UnknownTypeException e) {
+ System.err.println(e.getMessage());
+ e.printStackTrace();
+ }
+ }
+ }
+}