You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@openoffice.apache.org by da...@apache.org on 2016/04/17 18:44:43 UTC

svn commit: r1739628 - in /openoffice/trunk/main: connectivity/source/drivers/flat/ETable.cxx tools/source/stream/stream.cxx

Author: damjan
Date: Sun Apr 17 16:44:43 2016
New Revision: 1739628

URL: http://svn.apache.org/viewvc?rev=1739628&view=rev
Log:
Make CSV line parsers consistent with CSV field parsers.

Our CSV field parsing algorithms treat fields starting with a quote
(immediately at the beginning of the row, or after the field delimiter) as
quoted. A quoted field ends at the corresponding closing quote, and any
remaining text between the closing quote and the next field delimiter or end
of line is appended to the text already extracted from the field, but not
processed further. Any quotes in this extra text are taken verbatim - they
do not quote anything.

Our CSV line parsers were big hacks - they essentially read and concatenate
lines until an even number of quote characters is found, and then feed this
through the CSV field parsers.

This patch rewrites the line parsers to work exactly how the field parsers
work. Text such as:
"another" ",something else
is now correctly parsed by both Calc and Base as:
[another "],[something else]
instead of breaking all further parsing.

Patch by: me


Modified:
    openoffice/trunk/main/connectivity/source/drivers/flat/ETable.cxx
    openoffice/trunk/main/tools/source/stream/stream.cxx

Modified: openoffice/trunk/main/connectivity/source/drivers/flat/ETable.cxx
URL: http://svn.apache.org/viewvc/openoffice/trunk/main/connectivity/source/drivers/flat/ETable.cxx?rev=1739628&r1=1739627&r2=1739628&view=diff
==============================================================================
--- openoffice/trunk/main/connectivity/source/drivers/flat/ETable.cxx (original)
+++ openoffice/trunk/main/connectivity/source/drivers/flat/ETable.cxx Sun Apr 17 16:44:43 2016
@@ -907,14 +907,64 @@ sal_Bool OFlatTable::readLine(QuotedToke
         return sal_False;
 
     QuotedTokenizedString sLine = line; // check if the string continues on next line
-    while( (sLine.GetString().GetTokenCount(m_cStringDelimiter) % 2) != 1 )
+    xub_StrLen nLastOffset = 0;
+    bool isQuoted = false;
+    bool isFieldStarting = true;
+    while (true)
     {
-        m_pFileStream->ReadByteStringLine(sLine,nEncoding);
-        if ( !m_pFileStream->IsEof() )
+        bool wasQuote = false;
+        const sal_Unicode *p;
+        p = sLine.GetString().GetBuffer();
+        p += nLastOffset;
+
+        while (*p)
+        {
+            if (isQuoted)
+            {
+                if (*p == m_cStringDelimiter)
+                    wasQuote = !wasQuote;
+                else
+                {
+                    if (wasQuote)
+                    {
+                        wasQuote = false;
+                        isQuoted = false;
+                        if (*p == m_cFieldDelimiter)
+                            isFieldStarting = true;
+                    }
+                }
+            }
+            else
+            {
+                if (isFieldStarting)
+                {
+                    isFieldStarting = false;
+                    if (*p == m_cStringDelimiter)
+                        isQuoted = true;
+                    else if (*p == m_cFieldDelimiter)
+                        isFieldStarting = true;
+                }
+                else if (*p == m_cFieldDelimiter)
+                    isFieldStarting = true;
+            }
+            ++p;
+        }
+
+        if (wasQuote)
+            isQuoted = false;
+
+        if (isQuoted)
         {
-            line.GetString().Append('\n');
-            line.GetString() += sLine.GetString();
-            sLine = line;
+            nLastOffset = sLine.Len();
+            m_pFileStream->ReadByteStringLine(sLine,nEncoding);
+            if ( !m_pFileStream->IsEof() )
+            {
+                line.GetString().Append('\n');
+                line.GetString() += sLine.GetString();
+                sLine = line;
+            }
+            else
+                break;
         }
         else
             break;

Modified: openoffice/trunk/main/tools/source/stream/stream.cxx
URL: http://svn.apache.org/viewvc/openoffice/trunk/main/tools/source/stream/stream.cxx?rev=1739628&r1=1739627&r2=1739628&view=diff
==============================================================================
--- openoffice/trunk/main/tools/source/stream/stream.cxx (original)
+++ openoffice/trunk/main/tools/source/stream/stream.cxx Sun Apr 17 16:44:43 2016
@@ -1128,38 +1128,59 @@ sal_Bool SvStream::ReadCsvLine( String&
     {
         const sal_Unicode* pSeps = rFieldSeparators.GetBuffer();
         xub_StrLen nLastOffset = 0;
-        xub_StrLen nQuotes = 0;
+        bool isQuoted = false;
+        bool isFieldStarting = true;
         while (!IsEof() && rStr.Len() < STRING_MAXLEN)
         {
+            bool wasQuote = false;
             bool bBackslashEscaped = false;
-            const sal_Unicode *p, *pStart;
-            p = pStart = rStr.GetBuffer();
+            const sal_Unicode *p;
+            p = rStr.GetBuffer();
             p += nLastOffset;
             while (*p)
             {
-                if (nQuotes)
+                if (isQuoted)
                 {
                     if (*p == cFieldQuote && !bBackslashEscaped)
-                        ++nQuotes;
-                    else if (bAllowBackslashEscape)
+                        wasQuote = !wasQuote;
+                    else
                     {
-                        if (*p == '\\')
-                            bBackslashEscaped = !bBackslashEscaped;
-                        else
-                            bBackslashEscaped = false;
+                        if (bAllowBackslashEscape)
+                        {
+                            if (*p == '\\')
+                                bBackslashEscaped = !bBackslashEscaped;
+                            else
+                                bBackslashEscaped = false;
+                        }
+                        if (wasQuote)
+                        {
+                            wasQuote = false;
+                            isQuoted = false;
+                            if (lcl_UnicodeStrChr( pSeps, *p ))
+                                isFieldStarting = true;
+                        }
                     }
                 }
-                else if (*p == cFieldQuote && (p == pStart ||
-                            lcl_UnicodeStrChr( pSeps, p[-1])))
-                    nQuotes = 1;
-                // A quote character inside a field content does not start
-                // a quote.
+                else
+                {
+                    if (isFieldStarting)
+                    {
+                        isFieldStarting = false;
+                        if (*p == cFieldQuote)
+                            isQuoted = true;
+                        else if (lcl_UnicodeStrChr( pSeps, *p ))
+                            isFieldStarting = true;
+                    }
+                    else if (lcl_UnicodeStrChr( pSeps, *p ))
+                        isFieldStarting = true;
+                }
                 ++p;
             }
 
-            if (nQuotes % 2 == 0)
-                break;
-            else
+            if (wasQuote)
+                isQuoted = false;
+
+            if (isQuoted)
             {
                 nLastOffset = rStr.Len();
                 String aNext;
@@ -1167,6 +1188,8 @@ sal_Bool SvStream::ReadCsvLine( String&
                 rStr += sal_Unicode(_LF);
                 rStr += aNext;
             }
+            else
+                break;
         }
     }
     return nError == SVSTREAM_OK;



Re: svn commit: r1739628 - in /openoffice/trunk/main: connectivity/source/drivers/flat/ETable.cxx tools/source/stream/stream.cxx

Posted by Kay Schenk <ka...@gmail.com>.
Super! I will test it out soonish -- maybe next week.

On Mon, Apr 18, 2016 at 10:39 AM, Damjan Jovanovic <da...@apache.org>
wrote:

> Furthermore, the new behaviour both fixes #126805 and matches Excel's
> behaviour on the same tests, so I am very happy.
>
> On Mon, Apr 18, 2016 at 2:05 AM, Damjan Jovanovic <da...@apache.org>
> wrote:
> > The way the CSV field parsers in both Calc and Base work is that a
> > quoted field is only quoted up until the earliest matching quote
> > character that has no adjacent quote to escape it. The text after it,
> > and until the field separator, is unquoted. See
> > QuotedTokenizedString::GetTokenSpecial() in
> > main/connectivity/source/drivers/file/quotedstring.cxx for Base, and
> > ScImportExport::ScanNextFieldFromString() in
> > main/sc/source/ui/docshell/impex.cxx for Calc in which a comment calls
> > this "Append remaining unquoted and undelimited data (dirty, dirty) to
> > this field".
> >
> > "abc"d is parsed as [abcd], and "another " " as [another "]. It's not
> > clear why this was done, but it is clear that it was done
> > intentionally.
> >
> > Damjan
> >
> > On Sun, Apr 17, 2016 at 10:34 PM, Dennis E. Hamilton
> > <de...@acm.org> wrote:
> >> Does the rule about using "" to make a single quote inside a quoted
> field also apply?
> >>
> >>  - Dennis
> >>
> >>> -----Original Message-----
> >>> From: damjan@apache.org [mailto:damjan@apache.org]
> >>> Sent: Sunday, April 17, 2016 09:45
> >>> To: commits@openoffice.apache.org
> >>> Subject: svn commit: r1739628 - in /openoffice/trunk/main:
> >>> connectivity/source/drivers/flat/ETable.cxx
> >>> tools/source/stream/stream.cxx
> >>>
> >>> Author: damjan
> >>> Date: Sun Apr 17 16:44:43 2016
> >>> New Revision: 1739628
> >>>
> >>> URL: http://svn.apache.org/viewvc?rev=1739628&view=rev
> >>> Log:
> >>> Make CSV line parsers consistent with CSV field parsers.
> >>>
> >>> Our CSV field parsing algorithms treats fields starting with a quote
> >>> (immediately at the beginning of the row, or after the field delimiter)
> >>> as
> >>> quoted. A quoted field ends at the corresponding closing quote, and any
> >>> remaining text between the closing quote and the next field delimeter
> or
> >>> end
> >>> of line is appended to the text already extracted from the field, but
> >>> not
> >>> processed further. Any quotes in this extra text are taken verbatim -
> >>> they
> >>> do not quote anything.
> >>>
> >>> Our CSV line parsers were big hacks - they essentially read and
> >>> concatenate
> >>> lines until an even number of quote characters is found, and then feed
> >>> this
> >>> through the CSV field parsers.
> >>>
> >>> This patch rewrites the line parsers to work exactly how the field
> >>> parsers
> >>> work. Text such as:
> >>> "another" ",something else
> >>> is now correctly parsed by both Calc and Base as:
> >>> [another "],[something else]
> >>> instead of breaking all further parsing.
> >>>
> >>> Patch by: me
> >>>
> >>>
> >>> Modified:
> >>>     openoffice/trunk/main/connectivity/source/drivers/flat/ETable.cxx
> >>>     openoffice/trunk/main/tools/source/stream/stream.cxx
> >>>
> >>> Modified:
> >>> openoffice/trunk/main/connectivity/source/drivers/flat/ETable.cxx
> >>> URL:
> >>>
> http://svn.apache.org/viewvc/openoffice/trunk/main/connectivity/source/d
> >>> rivers/flat/ETable.cxx?rev=1739628&r1=1739627&r2=1739628&view=diff
> >>>
> ========================================================================
> >>> ======
> >>> --- openoffice/trunk/main/connectivity/source/drivers/flat/ETable.cxx
> >>> (original)
> >>> +++ openoffice/trunk/main/connectivity/source/drivers/flat/ETable.cxx
> >>> Sun Apr 17 16:44:43 2016
> >>> @@ -907,14 +907,64 @@ sal_Bool OFlatTable::readLine(QuotedToke
> >>>          return sal_False;
> >>>
> >>>      QuotedTokenizedString sLine = line; // check if the string
> >>> continues on next line
> >>> -    while( (sLine.GetString().GetTokenCount(m_cStringDelimiter) % 2)
> !=
> >>> 1 )
> >>> +    xub_StrLen nLastOffset = 0;
> >>> +    bool isQuoted = false;
> >>> +    bool isFieldStarting = true;
> >>> +    while (true)
> >>>      {
> >>> -        m_pFileStream->ReadByteStringLine(sLine,nEncoding);
> >>> -        if ( !m_pFileStream->IsEof() )
> >>> +        bool wasQuote = false;
> >>> +        const sal_Unicode *p;
> >>> +        p = sLine.GetString().GetBuffer();
> >>> +        p += nLastOffset;
> >>> +
> >>> +        while (*p)
> >>> +        {
> >>> +            if (isQuoted)
> >>> +            {
> >>> +                if (*p == m_cStringDelimiter)
> >>> +                    wasQuote = !wasQuote;
> >>> +                else
> >>> +                {
> >>> +                    if (wasQuote)
> >>> +                    {
> >>> +                        wasQuote = false;
> >>> +                        isQuoted = false;
> >>> +                        if (*p == m_cFieldDelimiter)
> >>> +                            isFieldStarting = true;
> >>> +                    }
> >>> +                }
> >>> +            }
> >>> +            else
> >>> +            {
> >>> +                if (isFieldStarting)
> >>> +                {
> >>> +                    isFieldStarting = false;
> >>> +                    if (*p == m_cStringDelimiter)
> >>> +                        isQuoted = true;
> >>> +                    else if (*p == m_cFieldDelimiter)
> >>> +                        isFieldStarting = true;
> >>> +                }
> >>> +                else if (*p == m_cFieldDelimiter)
> >>> +                    isFieldStarting = true;
> >>> +            }
> >>> +            ++p;
> >>> +        }
> >>> +
> >>> +        if (wasQuote)
> >>> +            isQuoted = false;
> >>> +
> >>> +        if (isQuoted)
> >>>          {
> >>> -            line.GetString().Append('\n');
> >>> -            line.GetString() += sLine.GetString();
> >>> -            sLine = line;
> >>> +            nLastOffset = sLine.Len();
> >>> +            m_pFileStream->ReadByteStringLine(sLine,nEncoding);
> >>> +            if ( !m_pFileStream->IsEof() )
> >>> +            {
> >>> +                line.GetString().Append('\n');
> >>> +                line.GetString() += sLine.GetString();
> >>> +                sLine = line;
> >>> +            }
> >>> +            else
> >>> +                break;
> >>>          }
> >>>          else
> >>>              break;
> >>>
> >>> Modified: openoffice/trunk/main/tools/source/stream/stream.cxx
> >>> URL:
> >>>
> http://svn.apache.org/viewvc/openoffice/trunk/main/tools/source/stream/s
> >>> tream.cxx?rev=1739628&r1=1739627&r2=1739628&view=diff
> >>>
> ========================================================================
> >>> ======
> >>> --- openoffice/trunk/main/tools/source/stream/stream.cxx (original)
> >>> +++ openoffice/trunk/main/tools/source/stream/stream.cxx Sun Apr 17
> >>> 16:44:43 2016
> >>> @@ -1128,38 +1128,59 @@ sal_Bool SvStream::ReadCsvLine( String&
> >>>      {
> >>>          const sal_Unicode* pSeps = rFieldSeparators.GetBuffer();
> >>>          xub_StrLen nLastOffset = 0;
> >>> -        xub_StrLen nQuotes = 0;
> >>> +        bool isQuoted = false;
> >>> +        bool isFieldStarting = true;
> >>>          while (!IsEof() && rStr.Len() < STRING_MAXLEN)
> >>>          {
> >>> +            bool wasQuote = false;
> >>>              bool bBackslashEscaped = false;
> >>> -            const sal_Unicode *p, *pStart;
> >>> -            p = pStart = rStr.GetBuffer();
> >>> +            const sal_Unicode *p;
> >>> +            p = rStr.GetBuffer();
> >>>              p += nLastOffset;
> >>>              while (*p)
> >>>              {
> >>> -                if (nQuotes)
> >>> +                if (isQuoted)
> >>>                  {
> >>>                      if (*p == cFieldQuote && !bBackslashEscaped)
> >>> -                        ++nQuotes;
> >>> -                    else if (bAllowBackslashEscape)
> >>> +                        wasQuote = !wasQuote;
> >>> +                    else
> >>>                      {
> >>> -                        if (*p == '\\')
> >>> -                            bBackslashEscaped = !bBackslashEscaped;
> >>> -                        else
> >>> -                            bBackslashEscaped = false;
> >>> +                        if (bAllowBackslashEscape)
> >>> +                        {
> >>> +                            if (*p == '\\')
> >>> +                                bBackslashEscaped =
> !bBackslashEscaped;
> >>> +                            else
> >>> +                                bBackslashEscaped = false;
> >>> +                        }
> >>> +                        if (wasQuote)
> >>> +                        {
> >>> +                            wasQuote = false;
> >>> +                            isQuoted = false;
> >>> +                            if (lcl_UnicodeStrChr( pSeps, *p ))
> >>> +                                isFieldStarting = true;
> >>> +                        }
> >>>                      }
> >>>                  }
> >>> -                else if (*p == cFieldQuote && (p == pStart ||
> >>> -                            lcl_UnicodeStrChr( pSeps, p[-1])))
> >>> -                    nQuotes = 1;
> >>> -                // A quote character inside a field content does not
> >>> start
> >>> -                // a quote.
> >>> +                else
> >>> +                {
> >>> +                    if (isFieldStarting)
> >>> +                    {
> >>> +                        isFieldStarting = false;
> >>> +                        if (*p == cFieldQuote)
> >>> +                            isQuoted = true;
> >>> +                        else if (lcl_UnicodeStrChr( pSeps, *p ))
> >>> +                            isFieldStarting = true;
> >>> +                    }
> >>> +                    else if (lcl_UnicodeStrChr( pSeps, *p ))
> >>> +                        isFieldStarting = true;
> >>> +                }
> >>>                  ++p;
> >>>              }
> >>>
> >>> -            if (nQuotes % 2 == 0)
> >>> -                break;
> >>> -            else
> >>> +            if (wasQuote)
> >>> +                isQuoted = false;
> >>> +
> >>> +            if (isQuoted)
> >>>              {
> >>>                  nLastOffset = rStr.Len();
> >>>                  String aNext;
> >>> @@ -1167,6 +1188,8 @@ sal_Bool SvStream::ReadCsvLine( String&
> >>>                  rStr += sal_Unicode(_LF);
> >>>                  rStr += aNext;
> >>>              }
> >>> +            else
> >>> +                break;
> >>>          }
> >>>      }
> >>>      return nError == SVSTREAM_OK;
> >>
> >>
> >>
> >> ---------------------------------------------------------------------
> >> To unsubscribe, e-mail: dev-unsubscribe@openoffice.apache.org
> >> For additional commands, e-mail: dev-help@openoffice.apache.org
> >>
>
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: dev-unsubscribe@openoffice.apache.org
> For additional commands, e-mail: dev-help@openoffice.apache.org
>
>


-- 
----------------------------------------------------------------------
MzK

"Time spent with cats is never wasted."
                                -- Sigmund Freud

Re: svn commit: r1739628 - in /openoffice/trunk/main: connectivity/source/drivers/flat/ETable.cxx tools/source/stream/stream.cxx

Posted by Damjan Jovanovic <da...@apache.org>.
Furthermore, the new behaviour both fixes #126805 and matches Excel's
behaviour on the same tests, so I am very happy.

On Mon, Apr 18, 2016 at 2:05 AM, Damjan Jovanovic <da...@apache.org> wrote:
> The way the CSV field parsers in both Calc and Base work is that a
> quoted field is only quoted up until the earliest matching quote
> character that has no adjacent quote to escape it. The text after it,
> and until the field separator, is unquoted. See
> QuotedTokenizedString::GetTokenSpecial() in
> main/connectivity/source/drivers/file/quotedstring.cxx for Base, and
> ScImportExport::ScanNextFieldFromString() in
> main/sc/source/ui/docshell/impex.cxx for Calc in which a comment calls
> this "Append remaining unquoted and undelimited data (dirty, dirty) to
> this field".
>
> "abc"d is parsed as [abcd], and "another " " as [another "]. It's not
> clear why this was done, but it is clear that it was done
> intentionally.
>
> Damjan
>
> On Sun, Apr 17, 2016 at 10:34 PM, Dennis E. Hamilton
> <de...@acm.org> wrote:
>> Does the rule about using "" to make a single quote inside a quoted field also apply?
>>
>>  - Dennis
>>
>>> -----Original Message-----
>>> From: damjan@apache.org [mailto:damjan@apache.org]
>>> Sent: Sunday, April 17, 2016 09:45
>>> To: commits@openoffice.apache.org
>>> Subject: svn commit: r1739628 - in /openoffice/trunk/main:
>>> connectivity/source/drivers/flat/ETable.cxx
>>> tools/source/stream/stream.cxx
>>>
>>> Author: damjan
>>> Date: Sun Apr 17 16:44:43 2016
>>> New Revision: 1739628
>>>
>>> URL: http://svn.apache.org/viewvc?rev=1739628&view=rev
>>> Log:
>>> Make CSV line parsers consistent with CSV field parsers.
>>>
>>> Our CSV field parsing algorithms treats fields starting with a quote
>>> (immediately at the beginning of the row, or after the field delimiter)
>>> as
>>> quoted. A quoted field ends at the corresponding closing quote, and any
>>> remaining text between the closing quote and the next field delimeter or
>>> end
>>> of line is appended to the text already extracted from the field, but
>>> not
>>> processed further. Any quotes in this extra text are taken verbatim -
>>> they
>>> do not quote anything.
>>>
>>> Our CSV line parsers were big hacks - they essentially read and
>>> concatenate
>>> lines until an even number of quote characters is found, and then feed
>>> this
>>> through the CSV field parsers.
>>>
>>> This patch rewrites the line parsers to work exactly how the field
>>> parsers
>>> work. Text such as:
>>> "another" ",something else
>>> is now correctly parsed by both Calc and Base as:
>>> [another "],[something else]
>>> instead of breaking all further parsing.
>>>
>>> Patch by: me
>>>
>>>
>>> Modified:
>>>     openoffice/trunk/main/connectivity/source/drivers/flat/ETable.cxx
>>>     openoffice/trunk/main/tools/source/stream/stream.cxx
>>>
>>> Modified:
>>> openoffice/trunk/main/connectivity/source/drivers/flat/ETable.cxx
>>> URL:
>>> http://svn.apache.org/viewvc/openoffice/trunk/main/connectivity/source/d
>>> rivers/flat/ETable.cxx?rev=1739628&r1=1739627&r2=1739628&view=diff
>>> ========================================================================
>>> ======
>>> --- openoffice/trunk/main/connectivity/source/drivers/flat/ETable.cxx
>>> (original)
>>> +++ openoffice/trunk/main/connectivity/source/drivers/flat/ETable.cxx
>>> Sun Apr 17 16:44:43 2016
>>> @@ -907,14 +907,64 @@ sal_Bool OFlatTable::readLine(QuotedToke
>>>          return sal_False;
>>>
>>>      QuotedTokenizedString sLine = line; // check if the string
>>> continues on next line
>>> -    while( (sLine.GetString().GetTokenCount(m_cStringDelimiter) % 2) !=
>>> 1 )
>>> +    xub_StrLen nLastOffset = 0;
>>> +    bool isQuoted = false;
>>> +    bool isFieldStarting = true;
>>> +    while (true)
>>>      {
>>> -        m_pFileStream->ReadByteStringLine(sLine,nEncoding);
>>> -        if ( !m_pFileStream->IsEof() )
>>> +        bool wasQuote = false;
>>> +        const sal_Unicode *p;
>>> +        p = sLine.GetString().GetBuffer();
>>> +        p += nLastOffset;
>>> +
>>> +        while (*p)
>>> +        {
>>> +            if (isQuoted)
>>> +            {
>>> +                if (*p == m_cStringDelimiter)
>>> +                    wasQuote = !wasQuote;
>>> +                else
>>> +                {
>>> +                    if (wasQuote)
>>> +                    {
>>> +                        wasQuote = false;
>>> +                        isQuoted = false;
>>> +                        if (*p == m_cFieldDelimiter)
>>> +                            isFieldStarting = true;
>>> +                    }
>>> +                }
>>> +            }
>>> +            else
>>> +            {
>>> +                if (isFieldStarting)
>>> +                {
>>> +                    isFieldStarting = false;
>>> +                    if (*p == m_cStringDelimiter)
>>> +                        isQuoted = true;
>>> +                    else if (*p == m_cFieldDelimiter)
>>> +                        isFieldStarting = true;
>>> +                }
>>> +                else if (*p == m_cFieldDelimiter)
>>> +                    isFieldStarting = true;
>>> +            }
>>> +            ++p;
>>> +        }
>>> +
>>> +        if (wasQuote)
>>> +            isQuoted = false;
>>> +
>>> +        if (isQuoted)
>>>          {
>>> -            line.GetString().Append('\n');
>>> -            line.GetString() += sLine.GetString();
>>> -            sLine = line;
>>> +            nLastOffset = sLine.Len();
>>> +            m_pFileStream->ReadByteStringLine(sLine,nEncoding);
>>> +            if ( !m_pFileStream->IsEof() )
>>> +            {
>>> +                line.GetString().Append('\n');
>>> +                line.GetString() += sLine.GetString();
>>> +                sLine = line;
>>> +            }
>>> +            else
>>> +                break;
>>>          }
>>>          else
>>>              break;
>>>
>>> Modified: openoffice/trunk/main/tools/source/stream/stream.cxx
>>> URL:
>>> http://svn.apache.org/viewvc/openoffice/trunk/main/tools/source/stream/s
>>> tream.cxx?rev=1739628&r1=1739627&r2=1739628&view=diff
>>> ========================================================================
>>> ======
>>> --- openoffice/trunk/main/tools/source/stream/stream.cxx (original)
>>> +++ openoffice/trunk/main/tools/source/stream/stream.cxx Sun Apr 17
>>> 16:44:43 2016
>>> @@ -1128,38 +1128,59 @@ sal_Bool SvStream::ReadCsvLine( String&
>>>      {
>>>          const sal_Unicode* pSeps = rFieldSeparators.GetBuffer();
>>>          xub_StrLen nLastOffset = 0;
>>> -        xub_StrLen nQuotes = 0;
>>> +        bool isQuoted = false;
>>> +        bool isFieldStarting = true;
>>>          while (!IsEof() && rStr.Len() < STRING_MAXLEN)
>>>          {
>>> +            bool wasQuote = false;
>>>              bool bBackslashEscaped = false;
>>> -            const sal_Unicode *p, *pStart;
>>> -            p = pStart = rStr.GetBuffer();
>>> +            const sal_Unicode *p;
>>> +            p = rStr.GetBuffer();
>>>              p += nLastOffset;
>>>              while (*p)
>>>              {
>>> -                if (nQuotes)
>>> +                if (isQuoted)
>>>                  {
>>>                      if (*p == cFieldQuote && !bBackslashEscaped)
>>> -                        ++nQuotes;
>>> -                    else if (bAllowBackslashEscape)
>>> +                        wasQuote = !wasQuote;
>>> +                    else
>>>                      {
>>> -                        if (*p == '\\')
>>> -                            bBackslashEscaped = !bBackslashEscaped;
>>> -                        else
>>> -                            bBackslashEscaped = false;
>>> +                        if (bAllowBackslashEscape)
>>> +                        {
>>> +                            if (*p == '\\')
>>> +                                bBackslashEscaped = !bBackslashEscaped;
>>> +                            else
>>> +                                bBackslashEscaped = false;
>>> +                        }
>>> +                        if (wasQuote)
>>> +                        {
>>> +                            wasQuote = false;
>>> +                            isQuoted = false;
>>> +                            if (lcl_UnicodeStrChr( pSeps, *p ))
>>> +                                isFieldStarting = true;
>>> +                        }
>>>                      }
>>>                  }
>>> -                else if (*p == cFieldQuote && (p == pStart ||
>>> -                            lcl_UnicodeStrChr( pSeps, p[-1])))
>>> -                    nQuotes = 1;
>>> -                // A quote character inside a field content does not
>>> start
>>> -                // a quote.
>>> +                else
>>> +                {
>>> +                    if (isFieldStarting)
>>> +                    {
>>> +                        isFieldStarting = false;
>>> +                        if (*p == cFieldQuote)
>>> +                            isQuoted = true;
>>> +                        else if (lcl_UnicodeStrChr( pSeps, *p ))
>>> +                            isFieldStarting = true;
>>> +                    }
>>> +                    else if (lcl_UnicodeStrChr( pSeps, *p ))
>>> +                        isFieldStarting = true;
>>> +                }
>>>                  ++p;
>>>              }
>>>
>>> -            if (nQuotes % 2 == 0)
>>> -                break;
>>> -            else
>>> +            if (wasQuote)
>>> +                isQuoted = false;
>>> +
>>> +            if (isQuoted)
>>>              {
>>>                  nLastOffset = rStr.Len();
>>>                  String aNext;
>>> @@ -1167,6 +1188,8 @@ sal_Bool SvStream::ReadCsvLine( String&
>>>                  rStr += sal_Unicode(_LF);
>>>                  rStr += aNext;
>>>              }
>>> +            else
>>> +                break;
>>>          }
>>>      }
>>>      return nError == SVSTREAM_OK;
>>
>>
>>
>> ---------------------------------------------------------------------
>> To unsubscribe, e-mail: dev-unsubscribe@openoffice.apache.org
>> For additional commands, e-mail: dev-help@openoffice.apache.org
>>

---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscribe@openoffice.apache.org
For additional commands, e-mail: dev-help@openoffice.apache.org


Re: svn commit: r1739628 - in /openoffice/trunk/main: connectivity/source/drivers/flat/ETable.cxx tools/source/stream/stream.cxx

Posted by Damjan Jovanovic <da...@apache.org>.
The way the CSV field parsers in both Calc and Base work is that a
quoted field is only quoted up until the earliest matching quote
character that has no adjacent quote to escape it. The text after it,
and until the field separator, is unquoted. See
QuotedTokenizedString::GetTokenSpecial() in
main/connectivity/source/drivers/file/quotedstring.cxx for Base, and
ScImportExport::ScanNextFieldFromString() in
main/sc/source/ui/docshell/impex.cxx for Calc in which a comment calls
this "Append remaining unquoted and undelimited data (dirty, dirty) to
this field".

"abc"d is parsed as [abcd], and "another " " as [another "]. It's not
clear why this was done, but it is clear that it was done
intentionally.

Damjan

On Sun, Apr 17, 2016 at 10:34 PM, Dennis E. Hamilton
<de...@acm.org> wrote:
> Does the rule about using "" to make a single quote inside a quoted field also apply?
>
>  - Dennis
>
>> -----Original Message-----
>> From: damjan@apache.org [mailto:damjan@apache.org]
>> Sent: Sunday, April 17, 2016 09:45
>> To: commits@openoffice.apache.org
>> Subject: svn commit: r1739628 - in /openoffice/trunk/main:
>> connectivity/source/drivers/flat/ETable.cxx
>> tools/source/stream/stream.cxx
>>
>> Author: damjan
>> Date: Sun Apr 17 16:44:43 2016
>> New Revision: 1739628
>>
>> URL: http://svn.apache.org/viewvc?rev=1739628&view=rev
>> Log:
>> Make CSV line parsers consistent with CSV field parsers.
>>
>> Our CSV field parsing algorithms treats fields starting with a quote
>> (immediately at the beginning of the row, or after the field delimiter)
>> as
>> quoted. A quoted field ends at the corresponding closing quote, and any
>> remaining text between the closing quote and the next field delimeter or
>> end
>> of line is appended to the text already extracted from the field, but
>> not
>> processed further. Any quotes in this extra text are taken verbatim -
>> they
>> do not quote anything.
>>
>> Our CSV line parsers were big hacks - they essentially read and
>> concatenate
>> lines until an even number of quote characters is found, and then feed
>> this
>> through the CSV field parsers.
>>
>> This patch rewrites the line parsers to work exactly how the field
>> parsers
>> work. Text such as:
>> "another" ",something else
>> is now correctly parsed by both Calc and Base as:
>> [another "],[something else]
>> instead of breaking all further parsing.
>>
>> Patch by: me
>>
>>
>> Modified:
>>     openoffice/trunk/main/connectivity/source/drivers/flat/ETable.cxx
>>     openoffice/trunk/main/tools/source/stream/stream.cxx
>>
>> Modified:
>> openoffice/trunk/main/connectivity/source/drivers/flat/ETable.cxx
>> URL:
>> http://svn.apache.org/viewvc/openoffice/trunk/main/connectivity/source/d
>> rivers/flat/ETable.cxx?rev=1739628&r1=1739627&r2=1739628&view=diff
>> ========================================================================
>> ======
>> --- openoffice/trunk/main/connectivity/source/drivers/flat/ETable.cxx
>> (original)
>> +++ openoffice/trunk/main/connectivity/source/drivers/flat/ETable.cxx
>> Sun Apr 17 16:44:43 2016
>> @@ -907,14 +907,64 @@ sal_Bool OFlatTable::readLine(QuotedToke
>>          return sal_False;
>>
>>      QuotedTokenizedString sLine = line; // check if the string
>> continues on next line
>> -    while( (sLine.GetString().GetTokenCount(m_cStringDelimiter) % 2) !=
>> 1 )
>> +    xub_StrLen nLastOffset = 0;
>> +    bool isQuoted = false;
>> +    bool isFieldStarting = true;
>> +    while (true)
>>      {
>> -        m_pFileStream->ReadByteStringLine(sLine,nEncoding);
>> -        if ( !m_pFileStream->IsEof() )
>> +        bool wasQuote = false;
>> +        const sal_Unicode *p;
>> +        p = sLine.GetString().GetBuffer();
>> +        p += nLastOffset;
>> +
>> +        while (*p)
>> +        {
>> +            if (isQuoted)
>> +            {
>> +                if (*p == m_cStringDelimiter)
>> +                    wasQuote = !wasQuote;
>> +                else
>> +                {
>> +                    if (wasQuote)
>> +                    {
>> +                        wasQuote = false;
>> +                        isQuoted = false;
>> +                        if (*p == m_cFieldDelimiter)
>> +                            isFieldStarting = true;
>> +                    }
>> +                }
>> +            }
>> +            else
>> +            {
>> +                if (isFieldStarting)
>> +                {
>> +                    isFieldStarting = false;
>> +                    if (*p == m_cStringDelimiter)
>> +                        isQuoted = true;
>> +                    else if (*p == m_cFieldDelimiter)
>> +                        isFieldStarting = true;
>> +                }
>> +                else if (*p == m_cFieldDelimiter)
>> +                    isFieldStarting = true;
>> +            }
>> +            ++p;
>> +        }
>> +
>> +        if (wasQuote)
>> +            isQuoted = false;
>> +
>> +        if (isQuoted)
>>          {
>> -            line.GetString().Append('\n');
>> -            line.GetString() += sLine.GetString();
>> -            sLine = line;
>> +            nLastOffset = sLine.Len();
>> +            m_pFileStream->ReadByteStringLine(sLine,nEncoding);
>> +            if ( !m_pFileStream->IsEof() )
>> +            {
>> +                line.GetString().Append('\n');
>> +                line.GetString() += sLine.GetString();
>> +                sLine = line;
>> +            }
>> +            else
>> +                break;
>>          }
>>          else
>>              break;
>>
>> Modified: openoffice/trunk/main/tools/source/stream/stream.cxx
>> URL:
>> http://svn.apache.org/viewvc/openoffice/trunk/main/tools/source/stream/s
>> tream.cxx?rev=1739628&r1=1739627&r2=1739628&view=diff
>> ========================================================================
>> ======
>> --- openoffice/trunk/main/tools/source/stream/stream.cxx (original)
>> +++ openoffice/trunk/main/tools/source/stream/stream.cxx Sun Apr 17
>> 16:44:43 2016
>> @@ -1128,38 +1128,59 @@ sal_Bool SvStream::ReadCsvLine( String&
>>      {
>>          const sal_Unicode* pSeps = rFieldSeparators.GetBuffer();
>>          xub_StrLen nLastOffset = 0;
>> -        xub_StrLen nQuotes = 0;
>> +        bool isQuoted = false;
>> +        bool isFieldStarting = true;
>>          while (!IsEof() && rStr.Len() < STRING_MAXLEN)
>>          {
>> +            bool wasQuote = false;
>>              bool bBackslashEscaped = false;
>> -            const sal_Unicode *p, *pStart;
>> -            p = pStart = rStr.GetBuffer();
>> +            const sal_Unicode *p;
>> +            p = rStr.GetBuffer();
>>              p += nLastOffset;
>>              while (*p)
>>              {
>> -                if (nQuotes)
>> +                if (isQuoted)
>>                  {
>>                      if (*p == cFieldQuote && !bBackslashEscaped)
>> -                        ++nQuotes;
>> -                    else if (bAllowBackslashEscape)
>> +                        wasQuote = !wasQuote;
>> +                    else
>>                      {
>> -                        if (*p == '\\')
>> -                            bBackslashEscaped = !bBackslashEscaped;
>> -                        else
>> -                            bBackslashEscaped = false;
>> +                        if (bAllowBackslashEscape)
>> +                        {
>> +                            if (*p == '\\')
>> +                                bBackslashEscaped = !bBackslashEscaped;
>> +                            else
>> +                                bBackslashEscaped = false;
>> +                        }
>> +                        if (wasQuote)
>> +                        {
>> +                            wasQuote = false;
>> +                            isQuoted = false;
>> +                            if (lcl_UnicodeStrChr( pSeps, *p ))
>> +                                isFieldStarting = true;
>> +                        }
>>                      }
>>                  }
>> -                else if (*p == cFieldQuote && (p == pStart ||
>> -                            lcl_UnicodeStrChr( pSeps, p[-1])))
>> -                    nQuotes = 1;
>> -                // A quote character inside a field content does not
>> start
>> -                // a quote.
>> +                else
>> +                {
>> +                    if (isFieldStarting)
>> +                    {
>> +                        isFieldStarting = false;
>> +                        if (*p == cFieldQuote)
>> +                            isQuoted = true;
>> +                        else if (lcl_UnicodeStrChr( pSeps, *p ))
>> +                            isFieldStarting = true;
>> +                    }
>> +                    else if (lcl_UnicodeStrChr( pSeps, *p ))
>> +                        isFieldStarting = true;
>> +                }
>>                  ++p;
>>              }
>>
>> -            if (nQuotes % 2 == 0)
>> -                break;
>> -            else
>> +            if (wasQuote)
>> +                isQuoted = false;
>> +
>> +            if (isQuoted)
>>              {
>>                  nLastOffset = rStr.Len();
>>                  String aNext;
>> @@ -1167,6 +1188,8 @@ sal_Bool SvStream::ReadCsvLine( String&
>>                  rStr += sal_Unicode(_LF);
>>                  rStr += aNext;
>>              }
>> +            else
>> +                break;
>>          }
>>      }
>>      return nError == SVSTREAM_OK;
>
>
>
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: dev-unsubscribe@openoffice.apache.org
> For additional commands, e-mail: dev-help@openoffice.apache.org
>

---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscribe@openoffice.apache.org
For additional commands, e-mail: dev-help@openoffice.apache.org


RE: svn commit: r1739628 - in /openoffice/trunk/main: connectivity/source/drivers/flat/ETable.cxx tools/source/stream/stream.cxx

Posted by "Dennis E. Hamilton" <de...@acm.org>.
Does the rule about using "" to make a single quote inside a quoted field also apply?

 - Dennis

> -----Original Message-----
> From: damjan@apache.org [mailto:damjan@apache.org]
> Sent: Sunday, April 17, 2016 09:45
> To: commits@openoffice.apache.org
> Subject: svn commit: r1739628 - in /openoffice/trunk/main:
> connectivity/source/drivers/flat/ETable.cxx
> tools/source/stream/stream.cxx
> 
> Author: damjan
> Date: Sun Apr 17 16:44:43 2016
> New Revision: 1739628
> 
> URL: http://svn.apache.org/viewvc?rev=1739628&view=rev
> Log:
> Make CSV line parsers consistent with CSV field parsers.
> 
> Our CSV field parsing algorithms treat fields starting with a quote
> (immediately at the beginning of the row, or after the field delimiter)
> as
> quoted. A quoted field ends at the corresponding closing quote, and any
> remaining text between the closing quote and the next field delimiter or
> end
> of line is appended to the text already extracted from the field, but
> not
> processed further. Any quotes in this extra text are taken verbatim -
> they
> do not quote anything.
> 
> Our CSV line parsers were big hacks - they essentially read and
> concatenate
> lines until an even number of quote characters is found, and then feed
> this
> through the CSV field parsers.
> 
> This patch rewrites the line parsers to work exactly how the field
> parsers
> work. Text such as:
> "another" ",something else
> is now correctly parsed by both Calc and Base as:
> [another "],[something else]
> instead of breaking all further parsing.
> 
> Patch by: me
> 
> 
> Modified:
>     openoffice/trunk/main/connectivity/source/drivers/flat/ETable.cxx
>     openoffice/trunk/main/tools/source/stream/stream.cxx
> 
> Modified:
> openoffice/trunk/main/connectivity/source/drivers/flat/ETable.cxx
> URL:
> http://svn.apache.org/viewvc/openoffice/trunk/main/connectivity/source/d
> rivers/flat/ETable.cxx?rev=1739628&r1=1739627&r2=1739628&view=diff
> ========================================================================
> ======
> --- openoffice/trunk/main/connectivity/source/drivers/flat/ETable.cxx
> (original)
> +++ openoffice/trunk/main/connectivity/source/drivers/flat/ETable.cxx
> Sun Apr 17 16:44:43 2016
> @@ -907,14 +907,64 @@ sal_Bool OFlatTable::readLine(QuotedToke
>          return sal_False;
> 
>      QuotedTokenizedString sLine = line; // check if the string
> continues on next line
> -    while( (sLine.GetString().GetTokenCount(m_cStringDelimiter) % 2) !=
> 1 )
> +    xub_StrLen nLastOffset = 0;
> +    bool isQuoted = false;
> +    bool isFieldStarting = true;
> +    while (true)
>      {
> -        m_pFileStream->ReadByteStringLine(sLine,nEncoding);
> -        if ( !m_pFileStream->IsEof() )
> +        bool wasQuote = false;
> +        const sal_Unicode *p;
> +        p = sLine.GetString().GetBuffer();
> +        p += nLastOffset;
> +
> +        while (*p)
> +        {
> +            if (isQuoted)
> +            {
> +                if (*p == m_cStringDelimiter)
> +                    wasQuote = !wasQuote;
> +                else
> +                {
> +                    if (wasQuote)
> +                    {
> +                        wasQuote = false;
> +                        isQuoted = false;
> +                        if (*p == m_cFieldDelimiter)
> +                            isFieldStarting = true;
> +                    }
> +                }
> +            }
> +            else
> +            {
> +                if (isFieldStarting)
> +                {
> +                    isFieldStarting = false;
> +                    if (*p == m_cStringDelimiter)
> +                        isQuoted = true;
> +                    else if (*p == m_cFieldDelimiter)
> +                        isFieldStarting = true;
> +                }
> +                else if (*p == m_cFieldDelimiter)
> +                    isFieldStarting = true;
> +            }
> +            ++p;
> +        }
> +
> +        if (wasQuote)
> +            isQuoted = false;
> +
> +        if (isQuoted)
>          {
> -            line.GetString().Append('\n');
> -            line.GetString() += sLine.GetString();
> -            sLine = line;
> +            nLastOffset = sLine.Len();
> +            m_pFileStream->ReadByteStringLine(sLine,nEncoding);
> +            if ( !m_pFileStream->IsEof() )
> +            {
> +                line.GetString().Append('\n');
> +                line.GetString() += sLine.GetString();
> +                sLine = line;
> +            }
> +            else
> +                break;
>          }
>          else
>              break;
> 
> Modified: openoffice/trunk/main/tools/source/stream/stream.cxx
> URL:
> http://svn.apache.org/viewvc/openoffice/trunk/main/tools/source/stream/s
> tream.cxx?rev=1739628&r1=1739627&r2=1739628&view=diff
> ========================================================================
> ======
> --- openoffice/trunk/main/tools/source/stream/stream.cxx (original)
> +++ openoffice/trunk/main/tools/source/stream/stream.cxx Sun Apr 17
> 16:44:43 2016
> @@ -1128,38 +1128,59 @@ sal_Bool SvStream::ReadCsvLine( String&
>      {
>          const sal_Unicode* pSeps = rFieldSeparators.GetBuffer();
>          xub_StrLen nLastOffset = 0;
> -        xub_StrLen nQuotes = 0;
> +        bool isQuoted = false;
> +        bool isFieldStarting = true;
>          while (!IsEof() && rStr.Len() < STRING_MAXLEN)
>          {
> +            bool wasQuote = false;
>              bool bBackslashEscaped = false;
> -            const sal_Unicode *p, *pStart;
> -            p = pStart = rStr.GetBuffer();
> +            const sal_Unicode *p;
> +            p = rStr.GetBuffer();
>              p += nLastOffset;
>              while (*p)
>              {
> -                if (nQuotes)
> +                if (isQuoted)
>                  {
>                      if (*p == cFieldQuote && !bBackslashEscaped)
> -                        ++nQuotes;
> -                    else if (bAllowBackslashEscape)
> +                        wasQuote = !wasQuote;
> +                    else
>                      {
> -                        if (*p == '\\')
> -                            bBackslashEscaped = !bBackslashEscaped;
> -                        else
> -                            bBackslashEscaped = false;
> +                        if (bAllowBackslashEscape)
> +                        {
> +                            if (*p == '\\')
> +                                bBackslashEscaped = !bBackslashEscaped;
> +                            else
> +                                bBackslashEscaped = false;
> +                        }
> +                        if (wasQuote)
> +                        {
> +                            wasQuote = false;
> +                            isQuoted = false;
> +                            if (lcl_UnicodeStrChr( pSeps, *p ))
> +                                isFieldStarting = true;
> +                        }
>                      }
>                  }
> -                else if (*p == cFieldQuote && (p == pStart ||
> -                            lcl_UnicodeStrChr( pSeps, p[-1])))
> -                    nQuotes = 1;
> -                // A quote character inside a field content does not
> start
> -                // a quote.
> +                else
> +                {
> +                    if (isFieldStarting)
> +                    {
> +                        isFieldStarting = false;
> +                        if (*p == cFieldQuote)
> +                            isQuoted = true;
> +                        else if (lcl_UnicodeStrChr( pSeps, *p ))
> +                            isFieldStarting = true;
> +                    }
> +                    else if (lcl_UnicodeStrChr( pSeps, *p ))
> +                        isFieldStarting = true;
> +                }
>                  ++p;
>              }
> 
> -            if (nQuotes % 2 == 0)
> -                break;
> -            else
> +            if (wasQuote)
> +                isQuoted = false;
> +
> +            if (isQuoted)
>              {
>                  nLastOffset = rStr.Len();
>                  String aNext;
> @@ -1167,6 +1188,8 @@ sal_Bool SvStream::ReadCsvLine( String&
>                  rStr += sal_Unicode(_LF);
>                  rStr += aNext;
>              }
> +            else
> +                break;
>          }
>      }
>      return nError == SVSTREAM_OK;



---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscribe@openoffice.apache.org
For additional commands, e-mail: dev-help@openoffice.apache.org