You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2013/02/08 12:24:27 UTC

svn commit: r1443940 - in /manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml: BOMEncodingDetector.java SingleByteReceiver.java SingleCharacterReceiver.java

Author: kwright
Date: Fri Feb  8 11:24:26 2013
New Revision: 1443940

URL: http://svn.apache.org/r1443940
Log:
Finish BOMEncodingDetector.

Modified:
    manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/BOMEncodingDetector.java
    manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/SingleByteReceiver.java
    manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/SingleCharacterReceiver.java

Modified: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/BOMEncodingDetector.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/BOMEncodingDetector.java?rev=1443940&r1=1443939&r2=1443940&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/BOMEncodingDetector.java (original)
+++ manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/BOMEncodingDetector.java Fri Feb  8 11:24:26 2013
@@ -20,6 +20,7 @@ package org.apache.manifoldcf.core.fuzzy
 
 import org.apache.manifoldcf.core.interfaces.*;
 import java.util.*;
+import java.io.*;
 
 /** This class represents the parse state of the BOM (byte order mark) parser.
 * The byte order mark parser looks for a byte order mark at the start of a byte sequence,
@@ -120,6 +121,7 @@ public class BOMEncodingDetector extends
       if (theByte == 0xff)
       {
         // UTF-16BE detected
+        mark();
         return establishEncoding("UTF-16BE");
       }
       else
@@ -136,6 +138,7 @@ public class BOMEncodingDetector extends
       if (theByte == 0xbf)
       {
         // Encoding detected as utf-8
+        mark();
         return establishEncoding("UTF-8");
       }
       else
@@ -149,6 +152,7 @@ public class BOMEncodingDetector extends
       else
       {
         // Encoding detected as UTF-16LE
+        mark();
         return establishEncoding("UTF-16LE");
       }
       break;
@@ -163,17 +167,21 @@ public class BOMEncodingDetector extends
     case BOM_SEEN_FFFE00:
       if (theByte == 0x00)
       {
+        mark();
         return establishEncoding("UTF-32LE");
       }
       else
       {
-        resetToMark();
+        // Leave mark alone.
         return establishEncoding("UTF-16LE");
       }
 
     case BOM_SEEN_0000FE:
       if (theByte == 0xff)
+      {
+        mark();
         return establishEncoding("UTF-32BE");
+      }
       else
         return replay();
       
@@ -190,21 +198,14 @@ public class BOMEncodingDetector extends
     throws ManifoldCFException
   {
     this.encoding = encoding;
-    return playFromCurrentPoint();
+    return true;
   }
   
   /** Set a "mark".
   */
   protected void mark()
   {
-    // MHL
-  }
-  
-  /** Reset the "stream" to the last saved mark.
-  */
-  protected void resetToMark()
-  {
-    // MHL
+    replayBuffer.clear();
   }
   
   /** Establish NO encoding, and replay from the current saved point to the child, if any.
@@ -212,8 +213,7 @@ public class BOMEncodingDetector extends
   protected boolean replay()
     throws ManifoldCFException
   {
-    resetToMark();
-    return playFromCurrentPoint();
+    return true;
   }
   
   /** Send stream from current point onward with the current encoding.
@@ -221,8 +221,30 @@ public class BOMEncodingDetector extends
   protected boolean playFromCurrentPoint()
     throws ManifoldCFException
   {
-    // MHL
+    mark();
     return true;
   }
   
+  /** Deal with the remainder of the input by handing it to the overflow receiver, if any.
+  * This is called only when dealWithByte() returns true.
+  *@param buffer is the buffer of bytes that should come first.
+  *@param offset is the offset within the buffer of the first unconsumed byte.
+  *@param len is the number of unconsumed bytes in the buffer.
+  *@param inputStream is the stream that should come after the bytes in the buffer.
+  *@return true to abort, false if the end of the stream has been reached.
+  */
+  @Override
+  protected boolean dealWithRemainder(byte[] buffer, int offset, int len, InputStream inputStream)
+    throws IOException, ManifoldCFException
+  {
+    if (overflowByteReceiver == null)
+      return super.dealWithRemainder(buffer,offset,len,inputStream);
+    // Queue the unconsumed bytes; len must count down or the loop overruns the buffer.
+    while (len-- > 0)
+    {
+      replayBuffer.appendByte(buffer[offset++]);
+    }
+    return overflowByteReceiver.dealWithBytes(new PrefixedInputStream(replayBuffer,inputStream));
+  }
+
 }

Modified: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/SingleByteReceiver.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/SingleByteReceiver.java?rev=1443940&r1=1443939&r2=1443940&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/SingleByteReceiver.java (original)
+++ manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/SingleByteReceiver.java Fri Feb  8 11:24:26 2013
@@ -48,7 +48,9 @@ public abstract class SingleByteReceiver
       for (int i = 0; i < amt; i++)
       {
         if (dealWithByte(byteBuffer[i]))
-          return true;
+        {
+          return dealWithRemainder(byteBuffer,i+1,amt-(i+1),inputStream);
+        }
       }
     }
   }
@@ -59,5 +61,18 @@ public abstract class SingleByteReceiver
   public abstract boolean dealWithByte(byte b)
     throws IOException, ManifoldCFException;
 
-
+  /** Deal with the remainder of the input.
+  * This is called only when dealWithByte() returns true.
+  *@param buffer is the buffer of bytes that should come first.
+  *@param offset is the offset within the buffer of the first unconsumed byte.
+  *@param len is the number of unconsumed bytes in the buffer.
+  *@param inputStream is the stream that should come after the bytes in the buffer.
+  *@return true to abort, false if the end of the stream has been reached; this default implementation always returns true.
+  */
+  protected boolean dealWithRemainder(byte[] buffer, int offset, int len, InputStream inputStream)
+    throws IOException, ManifoldCFException
+  {
+    return true;
+  }
+  
 }

Modified: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/SingleCharacterReceiver.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/SingleCharacterReceiver.java?rev=1443940&r1=1443939&r2=1443940&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/SingleCharacterReceiver.java (original)
+++ manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/SingleCharacterReceiver.java Fri Feb  8 11:24:26 2013
@@ -49,7 +49,9 @@ public abstract class SingleCharacterRec
       for (int i = 0; i < amt; i++)
       {
         if (dealWithCharacter(charBuffer[i]))
-          return true;
+        {
+          return dealWithRemainder(charBuffer, i+1, amt-(i+1), reader);
+        }
       }
     }
   }
@@ -60,4 +62,18 @@ public abstract class SingleCharacterRec
   public abstract boolean dealWithCharacter(char c)
     throws IOException, ManifoldCFException;
   
+  /** Deal with the remainder of the input.
+  * This is called only when dealWithCharacter() returns true.
+  *@param buffer is the buffer of characters that should come first.
+  *@param offset is the offset within the buffer of the first unconsumed character.
+  *@param len is the number of unconsumed characters in the buffer.
+  *@param reader is the reader that should come after the characters in the buffer.
+  *@return true to abort, false if the end of the stream has been reached; this default implementation always returns true.
+  */
+  protected boolean dealWithRemainder(char[] buffer, int offset, int len, Reader reader)
+    throws IOException, ManifoldCFException
+  {
+    return true;
+  }
+
 }