You are viewing a plain text version of this content. The canonical link for it is here.
Posted to server-dev@james.apache.org by no...@apache.org on 2011/06/06 21:11:06 UTC

svn commit: r1132728 - in /james/mailbox/trunk/store/src: main/java/org/apache/james/mailbox/store/search/SearchUtil.java test/java/org/apache/james/mailbox/store/search/ test/java/org/apache/james/mailbox/store/search/SearchUtilTest.java

Author: norman
Date: Mon Jun  6 19:11:06 2011
New Revision: 1132728

URL: http://svn.apache.org/viewvc?rev=1132728&view=rev
Log:
More work on base subject extracting. Still not complete... See MAILBOX-10

Added:
    james/mailbox/trunk/store/src/test/java/org/apache/james/mailbox/store/search/
    james/mailbox/trunk/store/src/test/java/org/apache/james/mailbox/store/search/SearchUtilTest.java
Modified:
    james/mailbox/trunk/store/src/main/java/org/apache/james/mailbox/store/search/SearchUtil.java

Modified: james/mailbox/trunk/store/src/main/java/org/apache/james/mailbox/store/search/SearchUtil.java
URL: http://svn.apache.org/viewvc/james/mailbox/trunk/store/src/main/java/org/apache/james/mailbox/store/search/SearchUtil.java?rev=1132728&r1=1132727&r2=1132728&view=diff
==============================================================================
--- james/mailbox/trunk/store/src/main/java/org/apache/james/mailbox/store/search/SearchUtil.java (original)
+++ james/mailbox/trunk/store/src/main/java/org/apache/james/mailbox/store/search/SearchUtil.java Mon Jun  6 19:11:06 2011
@@ -18,7 +18,7 @@
  ****************************************************************/
 package org.apache.james.mailbox.store.search;
 
-import java.io.UnsupportedEncodingException;
+import java.nio.charset.Charset;
 
 import org.apache.james.mime4j.codec.DecoderUtil;
 import org.apache.james.mime4j.util.MimeUtil;
@@ -26,49 +26,254 @@ import org.apache.james.mime4j.util.Mime
 public class SearchUtil {
 
     private final static String FWD_PARENS = "(fwd)";
+    private final static String SUBJ_FWD_HDR = "[fwd:";
+    private final static String SUBJ_FWD_TRL = "]";
+    private final static String RE = "re";
+    private final static String FWD = "FWD";
+    private final static String FW = "fw";
+    private final static char WS = ' ';
+    private final static char OPEN_SQUARE_BRACKED = '[';
+    private final static char CLOSE_SQUARE_BRACKED = ']';
+    private final static char COLON = ':';
     
+    private final static Charset UTF8 = Charset.forName("UTF8");
+
     /**
      * Extract the base subject from the given subject. 
      * 
      * See rfc5256 2.1 Base Subject
      * 
-     * TODO: FIX ME 
-     * 
+     * Subject sorting and threading use the "base subject", which has
+     * specific subject artifacts removed.  Due to the complexity of these
+     * artifacts, the formal syntax for the subject extraction rules is
+     * ambiguous.  The following procedure is followed to determine the
+     * "base subject", using the [ABNF] formal syntax rules described in
+     * section 5:
+     * <p>
+     *    (1) Convert any RFC 2047 encoded-words in the subject to [UTF-8]
+     *        as described in "Internationalization Considerations".
+     *        Convert all tabs and continuations to space.  Convert all
+     *        multiple spaces to a single space.
+     * </p>
+     * <p>
+     *    (2) Remove all trailing text of the subject that matches the
+     *        subj-trailer ABNF; repeat until no more matches are possible.
+     * </p>
+     * <p>
+     *    (3) Remove all prefix text of the subject that matches the subj-
+     *        leader ABNF.
+     * </p>
+     * <p>
+     *    (4) If there is prefix text of the subject that matches the subj-
+     *        blob ABNF, and removing that prefix leaves a non-empty subj-
+     *        base, then remove the prefix text.
+     * </p>
+     * <p>
+     *    (5) Repeat (3) and (4) until no matches remain.
+     * </p>
+     * Note: It is possible to defer step (2) until step (6), but this
+     * requires checking for subj-trailer in step (4).
+     * <br>
+     * <p>
+     *    (6) If the resulting text begins with the subj-fwd-hdr ABNF and
+     *        ends with the subj-fwd-trl ABNF, remove the subj-fwd-hdr and
+     *        subj-fwd-trl and repeat from step (2).
+     * </p>
+     * <p>
+     *    (7) The resulting text is the "base subject" used in the SORT.
+     * </p>
+     *
+     *
      * @param subject
      * @return baseSubject
      */
     public static String getBaseSubject(String subject) {
-        try {
-            String decodedSubject = new String(MimeUtil.unfold(DecoderUtil.decodeEncodedWords(subject)).getBytes("UTF-8"), "UTF-8");
+            
+            //   (1) Convert any RFC 2047 encoded-words in the subject to [UTF-8]
+            //    as described in "Internationalization Considerations".
+            //    Convert all tabs and continuations to space.  Convert all
+            //    multiple spaces to a single space.
+            String decodedSubject = MimeUtil.unfold(DecoderUtil.decodeEncodedWords(subject));
+            decodedSubject = new String(decodedSubject.getBytes(UTF8), UTF8);
+
             // replace all tabs with spaces and replace multiple spaces with one space
             decodedSubject = decodedSubject.replaceAll("\t", " ").replaceAll("( ){2,}", " ");
             
+            
             while (true) {
-                boolean changed = false;
-                if (decodedSubject.startsWith(FWD_PARENS)) {
-                    decodedSubject = decodedSubject.substring(FWD_PARENS.length(), decodedSubject.length());
-                    changed = true;
-                }
-                if (decodedSubject.startsWith(" ")) {
-                    // remove all leading spaces
-                    decodedSubject = decodedSubject.replaceAll("^( )+", "");
-                    changed = true;
+                int decodedSubjectLength = decodedSubject.length();
+                while (true) {
+                    //    (2) Remove all trailing text of the subject that matches the
+                    //    subj-trailer ABNF; repeat until no more matches are possible.
+                    String subj = removeSubTrailers(decodedSubject);
+                    if (decodedSubjectLength > subj.length()) {
+                        decodedSubject = subj;
+                        decodedSubjectLength = decodedSubject.length();
+                    } else {
+                        break;
+                    }
+
                 }
-                int length = decodedSubject.length();
-                if (!changed) {
-                    changed = length != decodedSubject.length();
+                
+                while (true) {
+                    boolean matchedInner = false;
+
+                    //    (3) Remove all prefix text of the subject that matches the subj-
+                    //    leader ABNF.
+                    decodedSubjectLength = decodedSubject.length();
+                    decodedSubject = removeSubjLeaders(decodedSubject);
+                    if (decodedSubjectLength > decodedSubject.length()) {
+                        matchedInner = true;
+                        decodedSubjectLength = decodedSubject.length();
+
+                    }
+
+                    //    (4) If there is prefix text of the subject that matches the subj-
+                    //    blob ABNF, and removing that prefix leaves a non-empty subj-
+                    //    base, then remove the prefix text.
+                    decodedSubjectLength = decodedSubject.length();
+                    String subj = removeBlob(decodedSubject);
+                    
+                    // check if it will leave a non-empty subject
+                    if (subj.length() > 0) {
+                        decodedSubject = subj;
+                        if (decodedSubjectLength > decodedSubject.length()) {
+                            matchedInner = true;
+                            decodedSubjectLength = decodedSubject.length();
+
+                        }
+
+                    }
+                    // (5) Repeat (3) and (4) until no matches remain.
+                    if (!matchedInner) {
+                        // no more matches so break the loop 
+                        break;
+                    } 
                 }
                 
-                if(!changed) {
+                if (decodedSubject.startsWith(SUBJ_FWD_HDR) && decodedSubject.endsWith(SUBJ_FWD_TRL)) {
+                    //    (6) If the resulting text begins with the subj-fwd-hdr ABNF and
+                    //    ends with the subj-fwd-trl ABNF, remove the subj-fwd-hdr and
+                    //    subj-fwd-trl and repeat from step (2).
+                    decodedSubject = decodedSubject.substring(SUBJ_FWD_HDR.length(), decodedSubject.length() - SUBJ_FWD_TRL.length());
+                    decodedSubjectLength = decodedSubject.length();
+
+                } else {
                     break;
                 }
+               
             }
+            // (7) The resulting text is the "base subject" used in the SORT.
             return decodedSubject;
-        } catch (UnsupportedEncodingException e) {
-            // TODO Auto-generated catch block
-            e.printStackTrace();
+    }
+ 
+    private static String removeSubjectBlob(String subject) {
+        String subj = subject;
+        while(subj.charAt(0) == OPEN_SQUARE_BRACKED) {
+            int length = subj.length();
+            subj = removeBlob(subject);
+            int i = 0;
+            if (subj.charAt(i) == CLOSE_SQUARE_BRACKED) {
+                i++;
+            } else {
+                return subject;
+            }
+            while (subj.charAt(i) == WS) {
+                i++;
+            }
+            subj = subj.substring(i);
+            System.out.println(subj);
+
+            if (length == subj.length()) {
+                return subj;
+            }
+        }
+        return subj;
+    }
+    private static String removeSubjLeaders(String subject) {
+        
+        // subj-leader     = (*subj-blob subj-refwd) / WSP
+        // subj-blob       = "[" *BLOBCHAR "]" *WSP
+        // subj-refwd      = ("re" / ("fw" ["d"])) *WSP [subj-blob] ":"
+        //
+        // BLOBCHAR        = %x01-5a / %x5c / %x5e-7f
+        //                ; any CHAR except '[' and ']' */
+        
+        String subj = removeSubjectBlob(subject);
+        
+        
+        int subString = 0;
+        if (subj.startsWith(RE)) {
+            subString = RE.length();
+        } else if (subj.startsWith(FWD)) {
+            subString = FWD.length();
+        } else if (subj.startsWith(FW)) {
+            subString  = FW.length();
+        } else {
+            return subject;
+        } 
+        while(subj.charAt(subString) == WS) {
+            subString++;
+        }
+        subj = removeSubjectBlob(subj.substring(subString));
+        if (subj.endsWith(String.valueOf(CLOSE_SQUARE_BRACKED))) {
+            subString = 1;
+        } else {
+            subString = 0;
+        }
+
+        if (subj.charAt(subString) == COLON) {
+            subString++;
+        } else {
+            return subject;
+        }
+        
+        return subj.substring(subString);
+    }
+    
+    private static String removeSubTrailers(String decodedSubject) {
+        int subStringStart = 0;
+        int subStringEnd = decodedSubject.length();
+        
+        // remove the remove_subj_trailers
+        //
+        // subj-trailer    = "(fwd)" / WSP
+        int originalSize = decodedSubject.length();
+        int curPos = originalSize -1;
+        while(true) {
+            char c = decodedSubject.charAt(curPos--);
+            if (c == WS) {
+                subStringEnd--;
+            } else {
+                if (subStringEnd > FWD_PARENS.length() && decodedSubject.endsWith(FWD_PARENS)) {
+                    subStringEnd -= FWD_PARENS.length();
+                } 
+                break;
+            }
         }
-        return null;
+        decodedSubject = decodedSubject.substring(subStringStart, subStringEnd);
+        return decodedSubject;
     }
     
+    private static String removeBlob(String subject) {
+        int i = 0;
+        char lastChar = Character.UNASSIGNED;
+        for (int a = 0; a < subject.length(); a++) {
+            char c = subject.charAt(a);
+            lastChar = c;
+            if (( a != 0  && c == OPEN_SQUARE_BRACKED) || c == CLOSE_SQUARE_BRACKED) {
+                break;
+            }
+            i++;
+        }
+
+        if (lastChar != CLOSE_SQUARE_BRACKED) {
+            return subject;
+        } else {
+            return subject.substring(i);
+        }
+
+    }
+
+
 }

Added: james/mailbox/trunk/store/src/test/java/org/apache/james/mailbox/store/search/SearchUtilTest.java
URL: http://svn.apache.org/viewvc/james/mailbox/trunk/store/src/test/java/org/apache/james/mailbox/store/search/SearchUtilTest.java?rev=1132728&view=auto
==============================================================================
--- james/mailbox/trunk/store/src/test/java/org/apache/james/mailbox/store/search/SearchUtilTest.java (added)
+++ james/mailbox/trunk/store/src/test/java/org/apache/james/mailbox/store/search/SearchUtilTest.java Mon Jun  6 19:11:06 2011
@@ -0,0 +1,41 @@
+package org.apache.james.mailbox.store.search;
+
+import static org.junit.Assert.*;
+
+import org.junit.Test;
+
+public class SearchUtilTest {
+
+    @Test
+    public void testSimpleSubject() {
+        String subject ="This is my subject";
+        assertEquals(subject, SearchUtil.getBaseSubject(subject));
+    }
+    
+    @Test
+    public void testReplaceSpacesAndTabsInSubject() {
+        String subject ="This   is my\tsubject";
+        assertEquals("This is my subject", SearchUtil.getBaseSubject(subject));
+    }
+    
+    @Test
+    public void testRemoveTrailingSpace() {
+        String subject ="This is my subject ";
+        assertEquals("This is my subject", SearchUtil.getBaseSubject(subject));
+    }
+    
+    
+    @Test
+    public void testRemoveTrailingFwd() {
+        String subject ="This is my subject (fwd)";
+        assertEquals("This is my subject", SearchUtil.getBaseSubject(subject));
+    }
+    
+    /*
+    @Test
+    public void testRemoveLeaders() {
+        String subject ="[Blah blub] [go] re: This is my subject";
+        assertEquals("This is my subject", SearchUtil.getBaseSubject(subject));
+    }
+    */
+}



---------------------------------------------------------------------
To unsubscribe, e-mail: server-dev-unsubscribe@james.apache.org
For additional commands, e-mail: server-dev-help@james.apache.org