You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2016/01/27 13:35:44 UTC
svn commit: r1727018 -
/manifoldcf/branches/CONNECTORS-1270/connectors/opennlp/connector/src/main/java/org/apache/manifoldcf/agents/transformation/opennlp/OpenNlpExtractor.java
Author: kwright
Date: Wed Jan 27 12:35:44 2016
New Revision: 1727018
URL: http://svn.apache.org/viewvc?rev=1727018&view=rev
Log:
Do first N characters.
Modified:
manifoldcf/branches/CONNECTORS-1270/connectors/opennlp/connector/src/main/java/org/apache/manifoldcf/agents/transformation/opennlp/OpenNlpExtractor.java
Modified: manifoldcf/branches/CONNECTORS-1270/connectors/opennlp/connector/src/main/java/org/apache/manifoldcf/agents/transformation/opennlp/OpenNlpExtractor.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1270/connectors/opennlp/connector/src/main/java/org/apache/manifoldcf/agents/transformation/opennlp/OpenNlpExtractor.java?rev=1727018&r1=1727017&r2=1727018&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-1270/connectors/opennlp/connector/src/main/java/org/apache/manifoldcf/agents/transformation/opennlp/OpenNlpExtractor.java (original)
+++ manifoldcf/branches/CONNECTORS-1270/connectors/opennlp/connector/src/main/java/org/apache/manifoldcf/agents/transformation/opennlp/OpenNlpExtractor.java Wed Jan 27 12:35:44 2016
@@ -166,7 +166,7 @@ public class OpenNlpExtractor extends Ba
String description = null;
Long length = null;
- final MetadataAccumulator ma = new MetadataAccumulator(sp);
+ final MetadataAccumulator ma = new MetadataAccumulator(sp, document.getBinaryLength());
try {
@@ -214,15 +214,6 @@ public class OpenNlpExtractor extends Ba
length = new Long(ds.getBinaryLength());
}
- // Check to be sure downstream pipeline will accept document of specified length
- if (!activities.checkLengthIndexable(ds.getBinaryLength()))
- {
- activities.noDocument();
- resultCode = activities.EXCLUDED_LENGTH;
- description = "Downstream pipeline rejected document with length "+ds.getBinaryLength();
- return DOCUMENTSTATUS_REJECTED;
- }
-
}
finally
{
@@ -231,6 +222,8 @@ public class OpenNlpExtractor extends Ba
resultCode, description);
}
+ ma.done();
+
// Parsing complete!
// Create a copy of Repository Document
RepositoryDocument docCopy = document.duplicate();
@@ -244,11 +237,11 @@ public class OpenNlpExtractor extends Ba
docCopy.setBinary(is,newBinaryLength);
// add named entity meta-data
- Map<String,List<String>> nerMap = ma.getMetadata();
+ Map<String,Set<String>> nerMap = ma.getMetadata();
if (!nerMap.isEmpty()) {
- for (Entry<String, List<String>> entry : nerMap.entrySet()) {
- List<String> neList = entry.getValue();
- String[] neArray = neList.toArray(new String[neList.size()]);
+ for (Entry<String, Set<String>> entry : nerMap.entrySet()) {
+ Set<String> neList = entry.getValue();
+ String[] neArray = neList.toArray(new String[0]);
docCopy.addField(entry.getKey(), neArray);
}
}
@@ -535,22 +528,30 @@ public class OpenNlpExtractor extends Ba
throw new ManifoldCFException(e.getMessage(),e);
}
+ protected static int maximumExtractionCharacters = 524288;
+
/** An instance of this class receives characters in 64K chunks, and needs to accumulate
* extracted metadata that this transformer will pass down.
*/
protected class MetadataAccumulator {
+ char[] characterBuffer = null;
+ int bufferPointer = 0;
+
+ final int bufferSize;
+
final SentenceDetector sentenceDetector;
final Tokenizer tokenizer;
final NameFinderME peopleFinder;
final NameFinderME locationFinder;
final NameFinderME organizationFinder;
- final List<String> peopleList = new ArrayList<>();
- final List<String> locationsList = new ArrayList<>();
- final List<String> organizationsList = new ArrayList<>();
+ final Set<String> peopleList = new HashSet<>();
+ final Set<String> locationsList = new HashSet<>();
+ final Set<String> organizationsList = new HashSet<>();
- public MetadataAccumulator(final SpecPacker sp)
+ public MetadataAccumulator(final SpecPacker sp,
+ final long bytesize)
throws ManifoldCFException {
try {
sentenceDetector = OpenNlpExtractorConfig.sentenceDetector(sp.getSModelPath());
@@ -561,31 +562,41 @@ public class OpenNlpExtractor extends Ba
} catch (IOException e) {
throw new ManifoldCFException(e.getMessage(), e);
}
+ if (bytesize > maximumExtractionCharacters) {
+ bufferSize = maximumExtractionCharacters;
+ } else {
+ bufferSize = (int)bytesize;
+ }
}
/** Accept characters, including actual count.
*/
public void acceptCharacters(final char[] buffer, int amt) {
- // MHL
- }
-
- public Map<String,List<String>> getMetadata() {
- final Map<String, List<String>> nerMap = new HashMap<>();
- nerMap.put(PERSONS, peopleList);
- nerMap.put(LOCATIONS, locationsList);
- nerMap.put(ORGANIZATIONS, organizationsList);
- return nerMap;
+ if (characterBuffer == null) {
+ characterBuffer = new char[bufferSize];
+ }
+ int copyAmt;
+ if (amt > bufferSize - bufferPointer) {
+ copyAmt = bufferSize - bufferPointer;
+ } else {
+ copyAmt = amt;
+ }
+ int sourcePtr = 0;
+ while (copyAmt > 0) {
+ characterBuffer[bufferPointer++] = buffer[sourcePtr++];
+ copyAmt--;
+ }
}
-
- }
-
- /*
- The following logic needs to be added back in, but with rolling character buffers and duplicate sentence detection...
-
- List<String> peopleList = new ArrayList<>();
- List<String> locationsList = new ArrayList<>();
- List<String> organizationsList = new ArrayList<>();
+ public void done() {
+ if (bufferPointer == 0 || characterBuffer == null) {
+ return;
+ }
+
+ // Make a string from the character array
+ final String textContent = new String(characterBuffer, 0, bufferPointer);
+
+ // Break into sentences, tokens, and then people, locations, and organizations
String[] sentences = sentenceDetector.sentDetect(textContent);
for (String sentence : sentences) {
String[] tokens = tokenizer.tokenize(sentence);
@@ -598,10 +609,18 @@ public class OpenNlpExtractor extends Ba
spans = organizationFinder.find(tokens);
organizationsList.addAll(Arrays.asList(Span.spansToStrings(spans, tokens)));
-
}
-
- */
+ }
+
+ public Map<String,Set<String>> getMetadata() {
+ final Map<String, Set<String>> nerMap = new HashMap<>();
+ nerMap.put(PERSONS, peopleList);
+ nerMap.put(LOCATIONS, locationsList);
+ nerMap.put(ORGANIZATIONS, organizationsList);
+ return nerMap;
+ }
+
+ }
protected static interface DestinationStorage {
/** Get the output stream to write to. Caller should explicitly close this stream when done writing.