You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2016/01/27 13:35:44 UTC

svn commit: r1727018 - /manifoldcf/branches/CONNECTORS-1270/connectors/opennlp/connector/src/main/java/org/apache/manifoldcf/agents/transformation/opennlp/OpenNlpExtractor.java

Author: kwright
Date: Wed Jan 27 12:35:44 2016
New Revision: 1727018

URL: http://svn.apache.org/viewvc?rev=1727018&view=rev
Log:
Do first N characters.

Modified:
    manifoldcf/branches/CONNECTORS-1270/connectors/opennlp/connector/src/main/java/org/apache/manifoldcf/agents/transformation/opennlp/OpenNlpExtractor.java

Modified: manifoldcf/branches/CONNECTORS-1270/connectors/opennlp/connector/src/main/java/org/apache/manifoldcf/agents/transformation/opennlp/OpenNlpExtractor.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1270/connectors/opennlp/connector/src/main/java/org/apache/manifoldcf/agents/transformation/opennlp/OpenNlpExtractor.java?rev=1727018&r1=1727017&r2=1727018&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-1270/connectors/opennlp/connector/src/main/java/org/apache/manifoldcf/agents/transformation/opennlp/OpenNlpExtractor.java (original)
+++ manifoldcf/branches/CONNECTORS-1270/connectors/opennlp/connector/src/main/java/org/apache/manifoldcf/agents/transformation/opennlp/OpenNlpExtractor.java Wed Jan 27 12:35:44 2016
@@ -166,7 +166,7 @@ public class OpenNlpExtractor extends Ba
       String description = null;
       Long length = null;
 
-      final MetadataAccumulator ma = new MetadataAccumulator(sp);
+      final MetadataAccumulator ma = new MetadataAccumulator(sp, document.getBinaryLength());
       
       try {
 
@@ -214,15 +214,6 @@ public class OpenNlpExtractor extends Ba
           length = new Long(ds.getBinaryLength());
         }
 
-        // Check to be sure downstream pipeline will accept document of specified length
-        if (!activities.checkLengthIndexable(ds.getBinaryLength()))
-        {
-          activities.noDocument();
-          resultCode = activities.EXCLUDED_LENGTH;
-          description = "Downstream pipeline rejected document with length "+ds.getBinaryLength();
-          return DOCUMENTSTATUS_REJECTED;
-        }
-
       }
       finally
       {
@@ -231,6 +222,8 @@ public class OpenNlpExtractor extends Ba
           resultCode, description);
       }
       
+      ma.done();
+      
       // Parsing complete!
       // Create a copy of Repository Document
       RepositoryDocument docCopy = document.duplicate();
@@ -244,11 +237,11 @@ public class OpenNlpExtractor extends Ba
         docCopy.setBinary(is,newBinaryLength);
 
         // add named entity meta-data
-        Map<String,List<String>> nerMap = ma.getMetadata();
+        Map<String,Set<String>> nerMap = ma.getMetadata();
         if (!nerMap.isEmpty()) {
-          for (Entry<String, List<String>> entry : nerMap.entrySet()) {
-            List<String> neList = entry.getValue();
-            String[] neArray = neList.toArray(new String[neList.size()]);
+          for (Entry<String, Set<String>> entry : nerMap.entrySet()) {
+            Set<String> neList = entry.getValue();
+            String[] neArray = neList.toArray(new String[0]);
             docCopy.addField(entry.getKey(), neArray);
           }
         }
@@ -535,22 +528,30 @@ public class OpenNlpExtractor extends Ba
     throw new ManifoldCFException(e.getMessage(),e);
   }
 
+  protected static int maximumExtractionCharacters = 524288;
+  
   /** An instance of this class receives characters in 64K chunks, and needs to accumulate
   * extracted metadata that this transformer will pass down.
   */
   protected class MetadataAccumulator {
 
+    char[] characterBuffer = null;
+    int bufferPointer = 0;
+    
+    final int bufferSize;
+    
     final SentenceDetector sentenceDetector;
     final Tokenizer tokenizer;
     final NameFinderME peopleFinder;
     final NameFinderME locationFinder;
     final NameFinderME organizationFinder;
     
-    final List<String> peopleList = new ArrayList<>();
-    final List<String> locationsList = new ArrayList<>();
-    final List<String> organizationsList = new ArrayList<>();
+    final Set<String> peopleList = new HashSet<>();
+    final Set<String> locationsList = new HashSet<>();
+    final Set<String> organizationsList = new HashSet<>();
     
-    public MetadataAccumulator(final SpecPacker sp)
+    public MetadataAccumulator(final SpecPacker sp,
+      final long bytesize)
       throws ManifoldCFException {
       try {
         sentenceDetector = OpenNlpExtractorConfig.sentenceDetector(sp.getSModelPath());
@@ -561,31 +562,41 @@ public class OpenNlpExtractor extends Ba
       } catch (IOException e) {
         throw new ManifoldCFException(e.getMessage(), e);
       }
+      if (bytesize > maximumExtractionCharacters) {
+        bufferSize = maximumExtractionCharacters;
+      } else {
+        bufferSize = (int)bytesize;
+      }
     }
     
     /** Accept characters, including actual count.
     */
     public void acceptCharacters(final char[] buffer, int amt) {
-      // MHL
-    }
-    
-    public Map<String,List<String>> getMetadata() {
-      final Map<String, List<String>> nerMap = new HashMap<>();
-      nerMap.put(PERSONS, peopleList);
-      nerMap.put(LOCATIONS, locationsList);
-      nerMap.put(ORGANIZATIONS, organizationsList);
-      return nerMap;
+      if (characterBuffer == null) {
+        characterBuffer = new char[bufferSize];
+      }
+      int copyAmt;
+      if (amt > bufferSize - bufferPointer) {
+        copyAmt = bufferSize - bufferPointer;
+      } else {
+        copyAmt = amt;
+      }
+      int sourcePtr = 0;
+      while (copyAmt > 0) {
+        characterBuffer[bufferPointer++] = buffer[sourcePtr++];
+        copyAmt--;
+      }
     }
-    
-  }
-  
-  /*
-      The following logic needs to be added back in, but with rolling character buffers and duplicate sentence detection...
-  
-      List<String> peopleList = new ArrayList<>();
-      List<String> locationsList = new ArrayList<>();
-      List<String> organizationsList = new ArrayList<>();
 
+    public void done() {
+      if (bufferPointer == 0 || characterBuffer == null) {
+        return;
+      }
+      
+      // Make a string from the character array
+      final String textContent = new String(characterBuffer, 0, bufferPointer);
+
+      // Break into sentences, tokens, and then people, locations, and organizations
       String[] sentences = sentenceDetector.sentDetect(textContent);
       for (String sentence : sentences) {
         String[] tokens = tokenizer.tokenize(sentence);
@@ -598,10 +609,18 @@ public class OpenNlpExtractor extends Ba
 
         spans = organizationFinder.find(tokens);
         organizationsList.addAll(Arrays.asList(Span.spansToStrings(spans, tokens)));
-
       }
-
-  */
+    }
+    
+    public Map<String,Set<String>> getMetadata() {
+      final Map<String, Set<String>> nerMap = new HashMap<>();
+      nerMap.put(PERSONS, peopleList);
+      nerMap.put(LOCATIONS, locationsList);
+      nerMap.put(ORGANIZATIONS, organizationsList);
+      return nerMap;
+    }
+    
+  }
   
   protected static interface DestinationStorage {
     /** Get the output stream to write to.  Caller should explicitly close this stream when done writing.