You are viewing a plain text version of this content. The canonical link for it is here.

Posted to dev@stdcxx.apache.org by Martin Sebor <se...@roguewave.com> on 2007/09/05 03:14:19 UTC

Re: [PATCH] Use __rw_atomic_xxx() on Windows

What's the status of this? We need to decide if we can put this
in 4.2 or defer it for 4.2.1. To put it in 4.2 we need to make
sure the new functions don't cause a performance regression in
basic_string. I.e., we need to see the before and after numbers.

Martin

Martin Sebor wrote:
> One concern I have is performance. Does replacing the intrinsics with
> out of line function call whose semantics the compiler has no idea
> about have any impact on the runtime efficiency of the generated code?
> I would be especially interested in "real life" scenarios such as the
> usage of the atomic operations in basic_string.
> 
> It would be good to see some before and after numbers. If you don't
> have all the platforms to run the test post your benchmark and Travis
> can help you put them together.
> 
> FYI, in case you're not aware of this (it's not immediately obvious),
> even though we define the full set of atomic operations (i.e., for all
> integer types) the library only uses the overloads for int and long.
> 
> Martin
> 
> Farid Zaripov wrote:
>>  Attached is a patch, adding __rw_atomic_{add|xchg}xx() functions on 
>> Win32/Win64 platforms.
>>
>>  ChangeLog:
>>  * msvc-7.0.config: Added AS config variable.
>>  * msvc-8.0-x64.config: Ditto.
>>  * filterdef.js: Added definition of the CustomFileDef class
>>  * projectdef.js (InitAsmTool): New function to init custom build rule 
>> for .asm files.
>>  * projects.js: Added definitions of the platform dependent files.
>>  * utilities.js: Read AS configuration variable from the .config file.
>>  * i86/atomic.asm: New file with definitions of the __rw_atomic_xxx() 
>> for Win32 platform.
>>  * i86_64/atomic.asm: New file with definitions of the 
>> __rw_atomic_xxx() for Windows/x64 platform.
>>  * _mutex.h: Removed all dependencies on InterlockedXXX() API functions.
>>  Use new __rw_atomic_xxx() functions instead of InterlockedXXX().
>>  * once.cpp [_WIN32 && _DLL]: Tell linker to export __atomic_xxx()
>>  functions, defined in .asm files.
>>
>> Farid.
>>
>>
>> ------------------------------------------------------------------------
>>
>> Index: etc/config/windows/filterdef.js
>> ===================================================================
>> --- etc/config/windows/filterdef.js    (revision 570339)
>> +++ etc/config/windows/filterdef.js    (working copy)
>> @@ -25,7 +25,7 @@
>>  
>>  var sourceFilterName = "Source Files";
>>  var sourceFilterUuid = "{4FC737F1-C7A5-4376-A066-2A32D752A2FF}";
>> -var sourceFilterExts = ".cpp;.cxx;.s";
>> +var sourceFilterExts = ".cpp;.cxx;.s;.asm";
>>  
>>  var headerFilterName = "Header Files";
>>  var headerFilterUuid = "{93995380-89BD-4b04-88EB-625FBE52EBFB}";
>> @@ -56,6 +56,21 @@
>>      return str;
>>  }
>>  
>> +//------------------------------------------------
>> +// CustomFileDef class
>> +//------------------------------------------------
>> +
>> +// CustomFileDef .ctor
>> +function CustomFileDef(filepath, platform, initfun)
>> +{
>> +    this.filepath = filepath;
>> +    this.platform = platform;
>> +    this.initfun  = initfun;
>> +}
>> +
>> +// global array with platform dependent files definitions
>> +var customFileDefs = new Array();
>> +
>>  // common macros
>>  var cmnMacros = new Array();
>>  
>> @@ -126,7 +141,29 @@
>>      var VCFile = filter.AddFile(filename);
>>      if (null != filetype && typeof(VCFile.FileType) != "undefined")
>>          VCFile.FileType = filetype;
>> -        +
>> +    var customFileDef = null;
>> +
>> +    if (!exclude)
>> +    {
>> +        // find the platform dependent file definition
>> +        for (var i = 0; i < customFileDefs.length; ++i)
>> +        {
>> +            var custFileDef = customFileDefs[i];
>> +            var pos = VCFile.FullPath.length - 
>> custFileDef.filepath.length;
>> +            if (0 <= pos && pos == 
>> VCFile.FullPath.indexOf(custFileDef.filepath))
>> +            {
>> +                customFileDef = custFileDef;
>> +                break;
>> +            }
>> +        }
>> +
>> +        // exclude this file from build if current platform
>> +        // is not custom file target platform
>> +        if (null != customFileDef && customFileDef.platform != PLATFORM)
>> +            exclude = true;
>> +    }
>> +         if (exclude)
>>      {
>>          var cfgs = VCFile.FileConfigurations;
>> @@ -144,6 +181,12 @@
>>              cfg.ExcludedFromBuild = exclude;
>>          }
>>      }
>> +    else if (null != customFileDef &&
>> +             "undefined" != typeof(customFileDef.initfun))
>> +    {
>> +        // init
>> +        customFileDef.initfun(VCFile);
>> +    }
>>  }
>>  
>>  // create VCFilter object from the FilterDef definition
>> Index: etc/config/windows/msvc-7.0.config
>> ===================================================================
>> --- etc/config/windows/msvc-7.0.config    (revision 570339)
>> +++ etc/config/windows/msvc-7.0.config    (working copy)
>> @@ -38,6 +38,7 @@
>>  CXX=cl
>>  LD=cl
>>  AR=lib
>> +AS=ml
>>  
>>  // Use singlethreaded or mutlithreaded CRT in 11s, 11d solution 
>> configurations
>>  // 0 for MS VisualStudio .NET and MS VisualStudio .NET 2003
>> Index: etc/config/windows/msvc-8.0-x64.config
>> ===================================================================
>> --- etc/config/windows/msvc-8.0-x64.config    (revision 570339)
>> +++ etc/config/windows/msvc-8.0-x64.config    (working copy)
>> @@ -1,2 +1,3 @@
>>  #include msvc-8.0
>>  PLATFORM=x64
>> +AS=ml64
>> Index: etc/config/windows/projectdef.js
>> ===================================================================
>> --- etc/config/windows/projectdef.js    (revision 570339)
>> +++ etc/config/windows/projectdef.js    (working copy)
>> @@ -941,3 +941,25 @@
>>  
>>      return projectDef;
>>  }
>> +
>> +// init custom build rule for .asm files
>> +function InitAsmTool(VCFile)
>> +{
>> +    var cfgs = VCFile.FileConfigurations;
>> +    for (var i = 1; i <= cfgs.Count; ++i)
>> +    {
>> +        var cfg = cfgs.Item(i);
>> +        if ((typeof(cfg.Tool.ToolKind) != "undefined" &&
>> +            cfg.Tool.ToolKind != "VCCustomBuildTool") ||
>> +            cfg.Tool.ToolName != "Custom Build Tool")
>> +        {
>> +            cfg.Tool = 
>> cfg.ProjectConfiguration.FileTools.Item("VCCustomBuildTool");
>> +        }
>> +
>> +        var tool = cfg.Tool;
>> +        tool.Description = "Compiling .asm file...";
>> +        tool.Outputs = "$(IntDir)\\$(InputName).obj";
>> +        tool.CommandLine = AS + " /c /nologo /Fo" + tool.Outputs +
>> +                           " /W3 /Zi /Ta" + VCFile.RelativePath;
>> +    }
>> +}
>> Index: etc/config/windows/projects.js
>> ===================================================================
>> --- etc/config/windows/projects.js    (revision 570339)
>> +++ etc/config/windows/projects.js    (working copy)
>> @@ -84,6 +84,10 @@
>>      projectDefs.push(new Array(configureDef));
>>  
>>  /////////////////////////////////////////////////////////////////////////////// 
>>
>> +    // add platform dependent files
>> +    customFileDefs.push(new CustomFileDef("i86\\atomic.asm", "Win32", 
>> InitAsmTool));
>> +    customFileDefs.push(new CustomFileDef("i86_64\\atomic.asm", 
>> "x64", InitAsmTool));
>> +
>>      var stdcxxDef = new ProjectDef(".stdcxx", typeLibrary);
>>      stdcxxDef.VCProjDir = ProjectsDir;
>>      stdcxxDef.FilterDefs.push(
>> Index: etc/config/windows/utilities.js
>> ===================================================================
>> --- etc/config/windows/utilities.js    (revision 570339)
>> +++ etc/config/windows/utilities.js    (working copy)
>> @@ -32,6 +32,7 @@
>>  var CXX = "cl";
>>  var LD = "cl";
>>  var AR = "lib";
>> +var AS = "ml";
>>  var SLNVER="8.00";
>>  var SLNCOMMENT="";
>>  var UNICODELOG = false;
>> @@ -112,6 +113,9 @@
>>          case "AR":
>>              AR = arr[2];
>>              break;
>> +        case "AS":
>> +            AS = arr[2];
>> +            break;
>>          case "SLNVER":
>>              SLNVER = arr[2];
>>              break;
>> @@ -179,6 +183,7 @@
>>      stream.WriteLine("  CXX=" + CXX);
>>      stream.WriteLine("  LD=" + LD);
>>      stream.WriteLine("  AR=" + AR);
>> +    stream.WriteLine("  AS=" + AS);
>>      stream.WriteLine("  SLNVER=" + SLNVER);
>>      stream.WriteLine("  SLNCOMMENT=" + SLNCOMMENT);
>>      stream.WriteLine("  UNICODELOG=" + UNICODELOG);
>> Index: include/rw/_mutex.h
>> ===================================================================
>> --- include/rw/_mutex.h    (revision 570431)
>> +++ include/rw/_mutex.h    (working copy)
>> @@ -140,15 +140,7 @@
>>  __declspec (dllimport) void __stdcall
>>  DeleteCriticalSection (_RTL_CRITICAL_SECTION*);
>>  
>> -__declspec (dllimport) long __stdcall
>> -InterlockedIncrement (_RWSTD_INTERLOCKED_T*);
>> -
>> -__declspec (dllimport) long __stdcall
>> -InterlockedDecrement (_RWSTD_INTERLOCKED_T*);
>> -
>> -__declspec (dllimport) long __stdcall
>> -InterlockedExchange (_RWSTD_INTERLOCKED_T*, long);
>> -
>> +_RWSTD_EXPORT int __rw_atomic_add32 (int*, int);
>>  }   // extern "C"
>>  
>>  _RWSTD_NAMESPACE (__rw) { @@ -478,11 +470,11 @@
>>      // implicit initialization used to prevent a g++ 2.95.2 warning 
>> on Tru64
>>      // sorry: semantics of inline function static data are wrong 
>> (you'll wind
>>      // up with multiple copies)
>> -    static volatile long __cntr /* = 0 */;   // initialization counter
>> +    static volatile int __cntr /* = 0 */;   // initialization counter
>>  
>>  #if defined (_WIN32) || defined (_WIN64)
>>      // MT safe
>> -    if (0 == __cntr && 1 == InterlockedIncrement ((long*)&__cntr))
>> +    if (0 == __cntr && 1 == __rw_atomic_add32 ((int*)&__cntr, +1))
>>  #else
>>      // not so safe (volatile should help)
>>      if (0 == __cntr && 1 == ++__cntr)
>> @@ -1161,19 +1153,20 @@
>>                                          false);
>>  }  
>> -/********************** i386/gcc **************************************/
>> +/********************** i386/gcc || _M_IX86 
>> *********************************/
>>  
>> -#elif defined (__i386__) && (defined (__GNUG__) || defined 
>> (__INTEL_COMPILER))
>> +#elif defined (__i386__) && (defined (__GNUG__) \
>> +   || defined (__INTEL_COMPILER)) || defined (_M_IX86)
>>  
>>  extern "C" {
>>  
>> -char __rw_atomic_add8 (char*, int);
>> -short __rw_atomic_add16 (short*, short);
>> -int __rw_atomic_add32 (int*, int);
>> +_RWSTD_EXPORT char __rw_atomic_add8 (char*, int);
>> +_RWSTD_EXPORT short __rw_atomic_add16 (short*, short);
>> +_RWSTD_EXPORT int __rw_atomic_add32 (int*, int);
>>  
>> -char __rw_atomic_xchg8 (char*, char);
>> -short __rw_atomic_xchg16 (short*, short);
>> -int __rw_atomic_xchg32 (int*, int);
>> +_RWSTD_EXPORT char __rw_atomic_xchg8 (char*, char);
>> +_RWSTD_EXPORT short __rw_atomic_xchg16 (short*, short);
>> +_RWSTD_EXPORT int __rw_atomic_xchg32 (int*, int);
>>  
>>  }   // extern "C"
>>  
>> @@ -1349,85 +1342,39 @@
>>                                 _RWSTD_STATIC_CAST (int, __y));
>>  }
>>  
>> +/********************** IA64/x86_64/_M_X64 
>> *****************************/
>>  
>> -/********************** WIN 32/64 ************************************/
>> +#elif defined (__ia64) || defined (__x86_64) || defined (_M_X64)
>>  
>> -#elif defined (_WIN32)
>> +extern "C" {
>>  
>> -// Interlocked[In|De]crement functions atomically modify their argument
>> -// and return the new value
>> +_RWSTD_EXPORT _RWSTD_INT8_T
>> +__rw_atomic_xchg8  (_RWSTD_INT8_T*,  _RWSTD_INT8_T);
>>  
>> -// InterlockedExchange atomically sets the value pointed to by the first
>> -// argument to that of the second argument and returns the original 
>> value
>> +_RWSTD_EXPORT _RWSTD_INT16_T
>> +__rw_atomic_xchg16 (_RWSTD_INT16_T*, _RWSTD_INT16_T);
>>  
>> -inline int
>> -__rw_atomic_preincrement (int &__x, bool)
>> -{
>> -    _RWSTD_COMPILE_ASSERT (sizeof __x == sizeof (long));
>> -    return InterlockedIncrement (_RWSTD_REINTERPRET_CAST (long*, &__x));
>> -}
>> +_RWSTD_EXPORT _RWSTD_INT32_T
>> +__rw_atomic_xchg32 (_RWSTD_INT32_T*, _RWSTD_INT32_T);
>>  
>>  
>> -inline unsigned int
>> -__rw_atomic_preincrement (unsigned int &__x, bool)
>> -{
>> -    return __rw_atomic_preincrement (_RWSTD_REINTERPRET_CAST (int&, 
>> __x),
>> -                                     false);
>> -}
>> +_RWSTD_EXPORT _RWSTD_INT8_T
>> +__rw_atomic_add8  (_RWSTD_INT8_T*,  _RWSTD_INT8_T);
>>  
>> +_RWSTD_EXPORT _RWSTD_INT16_T
>> +__rw_atomic_add16 (_RWSTD_INT16_T*, _RWSTD_INT16_T);
>>  
>> -inline int
>> -__rw_atomic_predecrement (int &__x, bool)
>> -{
>> -    _RWSTD_COMPILE_ASSERT (sizeof __x == sizeof (long));
>> -    return InterlockedDecrement (_RWSTD_REINTERPRET_CAST (long*, &__x));
>> -}
>> +_RWSTD_EXPORT _RWSTD_INT32_T
>> +__rw_atomic_add32 (_RWSTD_INT32_T*, _RWSTD_INT32_T);
>>  
>> -
>> -inline unsigned int
>> -__rw_atomic_predecrement (unsigned int &__x, bool)
>> -{
>> -    return __rw_atomic_predecrement (_RWSTD_REINTERPRET_CAST (int&, 
>> __x),
>> -                                     false);
>> -}
>> -
>> -
>> -inline int
>> -__rw_atomic_exchange (int &__x, int __y, bool)
>> -{
>> -    _RWSTD_COMPILE_ASSERT (sizeof __x == sizeof (long));
>> -    return InterlockedExchange (_RWSTD_REINTERPRET_CAST (long*, &__x),
>> -                                _RWSTD_STATIC_CAST (long, __y));
>> -}
>> -
>> -
>> -inline unsigned int
>> -__rw_atomic_exchange (unsigned int &__x, unsigned int __y, bool)
>> -{
>> -    return __rw_atomic_exchange (_RWSTD_REINTERPRET_CAST (int&, __x),
>> -                                 _RWSTD_STATIC_CAST (int, __y), false);
>> -}
>> -
>> -/********************** IA64/x86_64 ***********************************/
>> -
>> -#elif defined (__ia64) || defined (__x86_64)
>> -
>> -extern "C" {
>> -
>> -_RWSTD_INT8_T  __rw_atomic_xchg8  (_RWSTD_INT8_T*,  _RWSTD_INT8_T);
>> -_RWSTD_INT16_T __rw_atomic_xchg16 (_RWSTD_INT16_T*, _RWSTD_INT16_T);
>> -_RWSTD_INT32_T __rw_atomic_xchg32 (_RWSTD_INT32_T*, _RWSTD_INT32_T);
>> -
>> -
>> -_RWSTD_INT8_T  __rw_atomic_add8  (_RWSTD_INT8_T*,  _RWSTD_INT8_T);
>> -_RWSTD_INT16_T __rw_atomic_add16 (_RWSTD_INT16_T*, _RWSTD_INT16_T);
>> -_RWSTD_INT32_T __rw_atomic_add32 (_RWSTD_INT32_T*, _RWSTD_INT32_T);
>> -
>>  #ifdef _RWSTD_INT64_T
>>  
>> -_RWSTD_INT64_T __rw_atomic_xchg64 (_RWSTD_INT64_T*, _RWSTD_INT64_T);
>> -_RWSTD_INT64_T __rw_atomic_add64 (_RWSTD_INT64_T*, _RWSTD_INT64_T);
>> +_RWSTD_EXPORT _RWSTD_INT64_T
>> +__rw_atomic_xchg64 (_RWSTD_INT64_T*, _RWSTD_INT64_T);
>>  
>> +_RWSTD_EXPORT _RWSTD_INT64_T
>> +__rw_atomic_add64 (_RWSTD_INT64_T*, _RWSTD_INT64_T);
>> +
>>  #endif   // _RWSTD_INT64_T
>>  
>>  }   // extern "C"
>> Index: src/i86/atomic.asm
>> ===================================================================
>> --- src/i86/atomic.asm    (revision 0)
>> +++ src/i86/atomic.asm    (revision 0)
>> @@ -0,0 +1,178 @@
>> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 
>>
>> +;
>> +; i86/atomic.asm
>> +;
>> +; $Id$
>> +;
>> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 
>>
>> +;
>> +; Licensed to the Apache Software  Foundation (ASF) under one or more
>> +; contributor  license agreements.  See  the NOTICE  file distributed
>> +; with  this  work  for  additional information  regarding  copyright
>> +; ownership.   The ASF  licenses this  file to  you under  the Apache
>> +; License, Version  2.0 (the  "License"); you may  not use  this file
>> +; except in  compliance with the License.   You may obtain  a copy of
>> +; the License at
>> +;
>> +; http://www.apache.org/licenses/LICENSE-2.0
>> +;
>> +; Unless required by applicable law or agreed to in writing, software
>> +; distributed under the  License is distributed on an  "AS IS" BASIS,
>> +; WITHOUT  WARRANTIES OR CONDITIONS  OF ANY  KIND, either  express or
>> +; implied.   See  the License  for  the  specific language  governing
>> +; permissions and limitations under the License.
>> +;
>> +; Copyright 2003-2006 Rogue Wave Software.
>> +; 
>> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 
>>
>> +
>> +
>> +                .486
>> +                .model flat
>> +                .code
>> +
>> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 
>>
>> +; extern "C" char __rw_atomic_xchg8 (char *x, char y);
>> +;
>> +; Atomically assigns the 8-bit value y to *x and returns
>> +; the original (before assignment) 8-bit value of *x.
>> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 
>>
>> +
>> +                align 4
>> +                public ___rw_atomic_xchg8
>> +___rw_atomic_xchg8 proc                     ; char (char *x, char y)
>> +
>> +arg_x           = dword ptr  4
>> +arg_y           = byte ptr  8
>> +                                            +                mov     
>> ecx, [esp+arg_x]    ; %ecx = x
>> +                mov     al,  [esp+arg_y]    ; %al = y
>> +                xchg    al,  [ecx]          ; %al <-> (%ecx)
>> +                ret                        +___rw_atomic_xchg8 endp
>> +
>> +
>> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 
>>
>> +; extern "C" short __rw_atomic_xchg16 (short *x, short y);
>> +;
>> +; Atomically assigns the 16-bit value y to *x and returns
>> +; the original (before assignment) 16-bit value of *x.
>> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 
>>
>> +
>> +                align 4
>> +                public ___rw_atomic_xchg16
>> +___rw_atomic_xchg16 proc                    ; short (short *x, short y)
>> +
>> +arg_x           = dword ptr  4
>> +arg_y           = word ptr  8
>> +                                            +                mov     
>> ecx, [esp+arg_x]    ; %ecx = x                 +                
>> mov     ax,  [esp+arg_y]    ; %eax = y                 
>> +                xchg    ax,  [ecx]          ; %ax <-> 
>> (%ecx)           +                ret
>> +___rw_atomic_xchg16 endp
>> +
>> +
>> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 
>>
>> +; extern "C" int __rw_atomic_xchg32 (int *x, int y);
>> +;
>> +; Atomically assigns the 32-bit value y to *x and returns
>> +; the original (before assignment) 32-bit value of *x.
>> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 
>>
>> +
>> +                align 4
>> +                public ___rw_atomic_xchg32
>> +___rw_atomic_xchg32 proc                    ; int (int *x, int y)
>> +
>> +arg_x           = dword ptr  4
>> +arg_y           = dword ptr  8
>> +                                            +                mov     
>> ecx, [esp+arg_x]    ; %ecx = x
>> +                mov     eax, [esp+arg_y]    ; %eax = y
>> +                xchg    eax, [ecx]          ; %eax <-> (%ecx)
>> +                ret
>> +___rw_atomic_xchg32 endp
>> +
>> +
>> +
>> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 
>>
>> +; extern "C" char __rw_atomic_add8 (char *x, int y);
>> +;
>> +; Atomically increments the 8-bit value *x by y and returns
>> +; the new (after increment) 8-bit value of *x.
>> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 
>>
>> +
>> +                align 4
>> +                public ___rw_atomic_add8
>> +___rw_atomic_add8 proc                       ; char (char *dst, int inc)
>> +
>> +arg_dst         = dword ptr  4
>> +arg_inc         = dword ptr  8
>> +                                           +                mov     
>> ecx, [esp+arg_dst]   ; %ecx = dst               +                
>> mov     eax, [esp+arg_inc]   ; %eax = inc               
>> +                mov     edx, eax           +
>> +                lock xadd [ecx], al          ; tmp = *dst;
>> +                                             ; dst += inc;
>> +                                             ; %al = 
>> tmp                +
>> +                add     eax, edx             ; return %eax + 
>> inc        +                ret
>> +___rw_atomic_add8 endp
>> +
>> +
>> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 
>>
>> +; extern "C" short __rw_atomic_add16 (short *x, short y);
>> +;
>> +; Atomically increments the 16-bit value *x by y and returns
>> +; the new (after increment) 16-bit value of *x.
>> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 
>>
>> +
>> +                align 4
>> +                public ___rw_atomic_add16
>> +___rw_atomic_add16 proc                      ; short (short *dst, 
>> short inc)
>> +
>> +arg_dst         = dword ptr  4
>> +arg_inc         = dword ptr  8
>> +
>> +                mov     ecx, [esp+arg_dst]   ; %ecx = 
>> dst               +                mov     eax, [esp+arg_inc]   ; %eax 
>> = inc               +                mov     edx, eax           
>> +                                          +                lock xadd 
>> [ecx], ax          ; tmp = *dst;
>> +                                             ; dst += inc;
>> +                                             ; %ax = 
>> tmp                +                                          
>> +                add     eax, edx             ; return %eax + 
>> inc        +                ret
>> +___rw_atomic_add16 endp
>> +
>> +
>> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 
>>
>> +; extern "C" int __rw_atomic_add32 (int *x, int y);
>> +;
>> +; Atomically increments the 32-bit value *x by y and returns
>> +; the new (after increment) 32-bit value of *x.
>> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 
>>
>> +
>> +                align 4
>> +                public ___rw_atomic_add32
>> +___rw_atomic_add32 proc                      ; int (int *dst, int inc)
>> +
>> +arg_dst         = dword ptr  4
>> +arg_inc         = dword ptr  8
>> +
>> +                mov     ecx, [esp+arg_dst]   ; %ecx = 
>> dst               +                mov     edx, [esp+arg_inc]   ; %edx 
>> = inc               +                mov     eax, edx         
>> +                                        +                lock xadd 
>> [ecx], eax         ; tmp = *dst;
>> +                                             ; dst += inc;
>> +                                             ; %eax = 
>> tmp                +                                        
>> +                add     eax, edx             ; return %eax + 
>> inc        +                ret
>> +___rw_atomic_add32 endp
>> +
>> +                end
>>
>> Property changes on: src\i86\atomic.asm
>> ___________________________________________________________________
>> Name: svn:keywords
>>    + Id
>> Name: svn:eol-style
>>    + native
>>
>> Index: src/i86_64/atomic.asm
>> ===================================================================
>> --- src/i86_64/atomic.asm    (revision 0)
>> +++ src/i86_64/atomic.asm    (revision 0)
>> @@ -0,0 +1,186 @@
>> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 
>>
>> +;
>> +; i86_64/atomic.asm
>> +;
>> +; $Id$
>> +;
>> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 
>>
>> +;
>> +; Licensed to the Apache Software  Foundation (ASF) under one or more
>> +; contributor  license agreements.  See  the NOTICE  file distributed
>> +; with  this  work  for  additional information  regarding  copyright
>> +; ownership.   The ASF  licenses this  file to  you under  the Apache
>> +; License, Version  2.0 (the  "License"); you may  not use  this file
>> +; except in  compliance with the License.   You may obtain  a copy of
>> +; the License at
>> +;
>> +; http://www.apache.org/licenses/LICENSE-2.0
>> +;
>> +; Unless required by applicable law or agreed to in writing, software
>> +; distributed under the  License is distributed on an  "AS IS" BASIS,
>> +; WITHOUT  WARRANTIES OR CONDITIONS  OF ANY  KIND, either  express or
>> +; implied.   See  the License  for  the  specific language  governing
>> +; permissions and limitations under the License.
>> +;
>> +; Copyright 2003-2006 Rogue Wave Software.
>> +; 
>> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 
>>
>> +
>> +
>> +                .code
>> +
>> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 
>>
>> +; extern "C" int8_t __rw_atomic_xchg8 (int8_t *x, int8_t y);
>> +;
>> +; Atomically assigns the 8-bit value y to *x and returns
>> +; the original (before assignment) 8-bit value of *x.
>> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 
>>
>> +
>> +                align 16
>> +                public __rw_atomic_xchg8
>> +__rw_atomic_xchg8 proc                    ; int8_t (int8_t *x, int8_t y)
>> +                                          ; %rcx = x
>> +                mov     al, dl            ; %al = y +                
>> xchg    al, [rcx]         ; %al <-> (%rcx)
>> +                ret
>> +__rw_atomic_xchg8 endp
>> +
>> +
>> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 
>>
>> +; extern "C" int16_t __rw_atomic_xchg16 (int16_t *x, int16_t y);
>> +;
>> +; Atomically assigns the 16-bit value y to *x and returns
>> +; the original (before assignment) 16-bit value of *x.
>> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 
>>
>> +
>> +                align 16
>> +                public __rw_atomic_xchg16
>> +__rw_atomic_xchg16 proc                   ; int16_t (int16_t *x, 
>> int16_t y)
>> +                                          ; %rcx = x
>> +                mov     ax, dx            ; %ax = y
>> +                xchg    ax, [rcx]         ; %ax <-> (%rcx)
>> +                ret
>> +__rw_atomic_xchg16 endp
>> +
>> +
>> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 
>>
>> +; extern "C" int32_t __rw_atomic_xchg32 (int32_t *x, int32_t y);
>> +;
>> +; Atomically assigns the 32-bit value y to *x and returns
>> +; the original (before assignment) 32-bit value of *x.
>> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 
>>
>> +
>> +                align 16
>> +                public __rw_atomic_xchg32
>> +__rw_atomic_xchg32 proc                   ; int32_t (int32_t *x, 
>> int32_t y)
>> +                                          ; %rcx = x
>> +                mov     eax, edx          ; %eax = y
>> +                xchg    eax, [rcx]        ; %eax <-> (%rcx)
>> +                ret
>> +__rw_atomic_xchg32 endp
>> +
>> +
>> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 
>>
>> +; extern "C" int32_t __rw_atomic_xchg64 (int64_t *x, int64_t y);
>> +;
>> +; Atomically assigns the 64-bit value y to *x and returns
>> +; the original (before assignment) 64-bit value of *x.
>> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 
>>
>> +
>> +                align 16
>> +                public __rw_atomic_xchg64
>> +__rw_atomic_xchg64 proc                   ; int64_t (int64_t *x, 
>> int64_t y)
>> +                                          ; %rcx = x
>> +                mov     rax, rdx          ; %rax = y
>> +                xchg    rax, [rcx]        ; %rax <-> (%rcx)
>> +                ret
>> +__rw_atomic_xchg64 endp
>> +
>> +
>> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 
>>
>> +; extern "C" int8_t __rw_atomic_add8 (int8_t *x, int8_t y);
>> +;
>> +; Atomically increments the 8-bit value *x by y and returns
>> +; the new (after increment) 8-bit value of *x.
>> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 
>>
>> +
>> +                align 16
>> +                public __rw_atomic_add8
>> +__rw_atomic_add8 proc                     ; int8_t (int8_t *dst, 
>> int8_t inc)
>> +                                          ; %rcx = dst
>> +                mov     eax, edx          ; %eax = inc
>> +
>> +                lock xadd [rcx], al       ; tmp = *dst
>> +                                          ; dst += inc      
>> +                                          ; %al = tmp       
>> +                add     eax, edx          ; return %al + inc
>> +                ret
>> +__rw_atomic_add8 endp
>> +
>> +
>> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 
>>
>> +; extern "C" int16_t __rw_atomic_add16 (int16_t *x, int16_t y);
>> +;
>> +; Atomically increments the 16-bit value *x by y and returns
>> +; the new (after increment) 16-bit value of *x.
>> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 
>>
>> +
>> +                align 16
>> +                public __rw_atomic_add16
>> +__rw_atomic_add16 proc                    ; int16_t (int16_t *dst, 
>> int16_t inc)
>> +                                          ; %rcx = dst
>> +                mov     ax,  dx           ; %ax = inc +
>> +                lock xadd [rcx], ax       ; tmp = *dst
>> +                                          ; dst += inc
>> +                                          ; eax = tmp +
>> +                add     ax,  dx           ; return %ax + inc
>> +                ret
>> +__rw_atomic_add16 endp
>> +
>> +
>> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 
>>
>> +; extern "C" int32_t __rw_atomic_add32 (int32_t *x, int32_t y);
>> +;
>> +; Atomically increments the 32-bit value *x by y and returns
>> +; the new (after increment) 32-bit value of *x.
>> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 
>>
>> +
>> +                align 16
>> +                public __rw_atomic_add32
>> +__rw_atomic_add32 proc                    ; int32_t (int32_t *dst, 
>> int32_t inc)
>> +                                          ; %rcx = dst
>> +                mov     eax, edx          ; %eax = inc
>> +
>> +                lock xadd [rcx], eax      ; tmp = *dst
>> +                                          ; dst += inc
>> +                                          ; %eax = tmp
>> +
>> +                add     eax, edx          ; return %eax + inc
>> +                ret
>> +__rw_atomic_add32 endp
>> +
>> +
>> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 
>>
>> +; extern "C" int64_t __rw_atomic_add64 (int64_t *x, int64_t y);
>> +;
>> +; Atomically increments the 32-bit value *x by y and returns
>> +; the new (after increment) 32-bit value of *x.
>> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 
>>
>> +
>> +                align 16
>> +                public __rw_atomic_add64
>> +__rw_atomic_add64 proc                    ; int64_t (int64_t *dst, 
>> int64_t inc)
>> +                                          ; %rcx = dst
>> +                mov     rax, rdx          ; %eax = inc
>> +
>> +                lock xadd [rcx], rax      ; tmp = *dst
>> +                                          ; dst += inc
>> +                                          ; %eax = tmp
>> +
>> +                add     rax, rdx          ; return %eax + inc
>> +                ret
>> +__rw_atomic_add64 endp
>> +
>> +                end
>>
>> Property changes on: src\i86_64\atomic.asm
>> ___________________________________________________________________
>> Name: svn:keywords
>>    + Id
>> Name: svn:eol-style
>>    + native
>>
>> Index: src/once.cpp
>> ===================================================================
>> --- src/once.cpp    (revision 570339)
>> +++ src/once.cpp    (working copy)
>> @@ -188,3 +188,32 @@
>>  }   // extern "C"
>>  
>>  }   // namespace __rw
>> +
>> +// export __rw_atomic_xxx() functions, defined in atomic.asm
>> +#if defined (_WIN32) && defined (_DLL)
>> +
>> +#  if defined (_M_IX86)
>> +
>> +#    pragma comment(linker, "/EXPORT:___rw_atomic_add8")
>> +#    pragma comment(linker, "/EXPORT:___rw_atomic_add16")
>> +#    pragma comment(linker, "/EXPORT:___rw_atomic_add32")
>> +#    pragma comment(linker, "/EXPORT:___rw_atomic_xchg8")
>> +#    pragma comment(linker, "/EXPORT:___rw_atomic_xchg16")
>> +#    pragma comment(linker, "/EXPORT:___rw_atomic_xchg32")
>> +
>> +#  elif defined (_M_X64)
>> +
>> +#    pragma comment(linker, "/EXPORT:__rw_atomic_add8")
>> +#    pragma comment(linker, "/EXPORT:__rw_atomic_add16")
>> +#    pragma comment(linker, "/EXPORT:__rw_atomic_add32")
>> +#    pragma comment(linker, "/EXPORT:__rw_atomic_xchg8")
>> +#    pragma comment(linker, "/EXPORT:__rw_atomic_xchg16")
>> +#    pragma comment(linker, "/EXPORT:__rw_atomic_xchg32")
>> +
>> +#    ifdef _RWSTD_INT64_T
>> +#      pragma comment(linker, "/EXPORT:__rw_atomic_add64")
>> +#      pragma comment(linker, "/EXPORT:__rw_atomic_xchg64")
>> +#    endif   // _RWSTD_INT64_T
>> +#  endif   // _M_IX86
>> +
>> +#endif   // _WIN32 && _DLL
>

Re: [PATCH] Use __rw_atomic_xxx() on Windows

Posted by Martin Sebor <se...@roguewave.com>.

Farid Zaripov wrote:
>> -----Original Message-----
>> From: Martin Sebor [mailto:sebor@roguewave.com] 
>> Sent: Thursday, September 06, 2007 5:49 AM
>> To: stdcxx-dev@incubator.apache.org
>> Subject: Re: [PATCH] Use __rw_atomic_xxx() on Windows
>>
>> Travis Vitek wrote:
>>> Oh, yeah. that is the other thing that I did Friday. I wrote a 
>>> testcase to compare __rw_atomic_add32() against 
>> InterlockedIncrement() on Win32.
>>> There is a performance penalty...
>> I'd be curious to know if the performance penalty is due to 
>> the function call overhead or something else.
>>
>> In any case though, I think we could tweak the patch and 
>> change the __rw_atomic_pre{de,in}crement() overloads for int 
>> and long to call the appropriate Interlocked{De,In}crement() 
>> intrinsics and have the other overloads use the new ones.
>>
>> Farid, what do you think about this approach?
> 
>   I agree. And I decided to make the changes above
> without making the tests to see the performance penalty.

Okay, that sounds like a good approach to me. You can still
commit the int and long atomic functions, we just won't call
them.

Martin

RE: [PATCH] Use __rw_atomic_xxx() on Windows

Posted by Farid Zaripov <Fa...@epam.com>.

> -----Original Message-----
> From: Martin Sebor [mailto:sebor@roguewave.com] 
> Sent: Thursday, September 06, 2007 5:49 AM
> To: stdcxx-dev@incubator.apache.org
> Subject: Re: [PATCH] Use __rw_atomic_xxx() on Windows
> 
> Travis Vitek wrote:
> > Oh, yeah. that is the other thing that I did Friday. I wrote a 
> > testcase to compare __rw_atomic_add32() against 
> InterlockedIncrement() on Win32.
> > There is a performance penalty...
> 
> I'd be curious to know if the performance penalty is due to 
> the function call overhead or something else.
> 
> In any case though, I think we could tweak the patch and 
> change the __rw_atomic_pre{de,in}crement() overloads for int 
> and long to call the appropriate Interlocked{De,In}crement() 
> intrinsics and have the other overloads use the new ones.
> 
> Farid, what do you think about this approach?

  I agree. And I decided to make the changes above
without making the tests to see the performance penalty.

Farid.

Re: [PATCH] Use __rw_atomic_xxx() on Windows

Posted by Martin Sebor <se...@roguewave.com>.

Travis Vitek wrote:
> Oh, yeah. that is the other thing that I did Friday. I wrote a testcase
> to compare __rw_atomic_add32() against InterlockedIncrement() on Win32.
> There is a performance penalty...

I'd be curious to know if the performance penalty is due to the
function call overhead or something else.

In any case though, I think we could tweak the patch and change
the __rw_atomic_pre{de,in}crement() overloads for int and long
to call the appropriate Interlocked{De,In}crement() intrinsics
and have the other overloads use the new ones.

Farid, what do you think about this approach?

Martin

> 
>   C:\Temp>t 2 && t 4 && t 8
>   ---------- locked inc ---- atomic_add ---- 2 threads
>   ms               4266            4469
>   ms/op      0.00003178      0.00003330      -4.7586%
>   thr ms          18117           18437
>   thr ms/op  0.00013498      0.00013737      -1.7663%
>   ---------- locked inc ---- atomic_add ---- 4 threads
>   ms               7969            8609
>   ms/op      0.00005937      0.00006414      -8.0311%
>   thr ms          36359           37019
>   thr ms/op  0.00027090      0.00027581      -1.8152%
>   ---------- locked inc ---- atomic_add ---- 8 threads
>   ms               5016            5484
>   ms/op      0.00003737      0.00004086      -9.3301%
>   thr ms          60846           66130
>   thr ms/op  0.00045334      0.00049271      -8.6842%
> 
>   C:\Temp>t 2 && t 4 && t 8
>   ---------- locked inc ---- atomic_add ---- 2 threads
>   ms               2781            2906
>   ms/op      0.00002072      0.00002165      -4.4948%
>   thr ms          14961           16093
>   thr ms/op  0.00011147      0.00011990      -7.5663%
>   ---------- locked inc ---- atomic_add ---- 4 threads
>   ms               2781            2891
>   ms/op      0.00002072      0.00002154      -3.9554%
>   thr ms          30867           31328
>   thr ms/op  0.00022998      0.00023341      -1.4935%
>   ---------- locked inc ---- atomic_add ---- 8 threads
>   ms               2782            2890
>   ms/op      0.00002073      0.00002153      -3.8821%
>   thr ms          64318           64341
>   thr ms/op  0.00047921      0.00047938      -0.0358%
> 
> I will do a quick run using the string performance test after lunch.
> I'll report the results on that later. I've pasted the source for the
> bulk of my test below. If someone wants the entire thing, let me know
> and I'll provide everything.
> 
> Travis
> 
> 
> Martin Sebor wrote:
>> Subject: Re: [PATCH] Use __rw_atomic_xxx() on Windows
>>
>> What's the status of this? We need to decide if we can put this
>> in 4.2 or defer it for 4.2.1. To put it in 4.2 we need to make
>> sure the new functions don't cause a performance regression in
>> basic_string. I.e., we need to see the before and after numbers.
>>
>> Martin
>>
>> Martin Sebor wrote:
>>> One concern I have is performance. Does replacing the intrinsics with
>>> out of line function call whose semantics the compiler has no idea
>>> about have any impact on the runtime efficiency of the 
>> generated code?
>>> I would be especially interested in "real life" scenarios such as the
>>> usage of the atomic operations in basic_string.
>>>
>>> It would be good to see some before and after numbers. If you don't
>>> have all the platforms to run the test post your benchmark and Travis
>>> can help you put them together.
> 
> #include <stdio.h>
> #include <stdlib.h>
> 
> #define WIN32_LEAN_AND_MEAN
> #include <windows.h>
> #include <process.h>
> 
> #include "lib.h"
> 
> #define MIN_THREADS 2
> #define MAX_THREADS 16
> 
> unsigned long locked_inc(long* val, long iters)
> {
>     const unsigned long t0 = GetTickCount ();
> 
>     long n;
>     for (n = 0; n < iters; ++n)
>     {
>         InterlockedIncrement(val);
>     }
> 
>     const unsigned long t1 = GetTickCount ();
> 
>     return (t1 - t0);
> }
> 
> unsigned long atomic_add(long* val, long iters)
> {
>     const unsigned long t0 = GetTickCount ();
> 
>     long n;
>     for (n = 0; n < iters; ++n)
>     {
>         __rw_atomic_add32(val, 1);
>     }
> 
>     const unsigned long t1 = GetTickCount ();
> 
>     return (t1 - t0);
> }
> 
> struct thread_param {
> 
>     // atomic variable
>     long* variable;
> 
>     // number of iterations
>     long iters;
> 
>     // function to invoke
>     unsigned long (*fun)(long*, long);
> 
>     // result of function
>     unsigned long result;
> 
>     // thread handle used by main thread
>     HANDLE thread;
> };
> 
> extern "C" {
> 
>     void thread_func(void* p)
>     {
>         thread_param* param = (thread_param*)p;
>         param->result = (param->fun)(param->variable, param->iters);
>     }
> 
> } // extern "C"
> 
> 
> unsigned long run_threads(int nthreads, unsigned long (*fun)(long*,
> long), long iters)
> {
>     thread_param params[MAX_THREADS];
>     long thread_var = 0;
> 
>     int i;
>     for (i = 0; i < nthreads; ++i) {
>         params[i].variable = &thread_var;
>         params[i].result   = 0;
>         params[i].fun      = fun;
>         params[i].iters    = iters;
>     }
> 
>     int n;
>     for (n = 0; n < nthreads; ++n) {
>         params[n].thread = (HANDLE)_beginthread(thread_func, 0,
> &params[n]);
>     }
> 
>     unsigned long thread_time = 0;
> 
>     for (n = 0; n < nthreads; ++n) {
>         WaitForSingleObject (params[n].thread, INFINITE);
>         thread_time += params[n].result;
>     }
> 
>     return thread_time;
> }
> 
> 
> int main(int argc, char* argv[])
> {
>     int nthreads = MIN_THREADS;
>     if (1 < argc)
>         nthreads = atoi(argv[1]);
> 
>     // cap thread count
>     if (nthreads < MIN_THREADS)
>         nthreads = MIN_THREADS;
>     else if (MAX_THREADS < nthreads)
>         nthreads = MAX_THREADS;
> 
>     const long ops = 0x7ffffff;
>     long thread_var;
>     
>     thread_var = 0;
>     unsigned long locked_inc_ms = locked_inc (&thread_var, ops);
>     
>     thread_var = 0;
>     unsigned long atomic_add_ms = atomic_add (&thread_var, ops);
> 
>     printf("---------- locked inc ---- atomic_add ---- %d threads\n",
> nthreads);
>     printf("ms           %8.u        %8.u\n", locked_inc_ms,
> atomic_add_ms);
> 
>     float locked_inc_ops_p_ms = 1.f * locked_inc_ms / ops;
>     float atomic_add_ops_p_ms = 1.f * atomic_add_ms / ops;
> 
>     printf("ms/op      %8.8f      %8.8f      %.4f%%\n", 
>         locked_inc_ops_p_ms, atomic_add_ops_p_ms,
>         100.f * (locked_inc_ops_p_ms - atomic_add_ops_p_ms) /
> locked_inc_ops_p_ms);
> 
>     // do it with threads
> 
>     locked_inc_ms = run_threads(nthreads, locked_inc, ops);
>     atomic_add_ms = run_threads(nthreads, atomic_add, ops);
> 
>     locked_inc_ms /= nthreads;
>     atomic_add_ms /= nthreads;
> 
>     printf("thr ms       %8.u        %8.u\n", locked_inc_ms,
> atomic_add_ms);
> 
>     locked_inc_ops_p_ms = 1.f * locked_inc_ms / ops;
>     atomic_add_ops_p_ms = 1.f * atomic_add_ms / ops;
> 
>      printf("thr ms/op  %8.8f      %8.8f      %.4f%%\n", 
>         locked_inc_ops_p_ms, atomic_add_ops_p_ms,
>         100.f * (locked_inc_ops_p_ms - atomic_add_ops_p_ms) /
> locked_inc_ops_p_ms);
> 
>     return 0;
> }

Re: [PATCH] Use __rw_atomic_xxx() on Windows

Posted by Martin Sebor <se...@roguewave.com>.

Travis Vitek wrote:
> Doh! I should know better. Here is the results from a 12d build on the
> same hardware.

Does this mean that there is almost no difference between the
intrinsic functions and the out of line ones, or that the test
is too simple to demonstrate them?

I expect the greatest advantage of the intrinsics over ordinary 
out-of-line functions to be that they (might) make it possible
for the optimizer to generate better code *in certain contexts*
depending on from where they are called. This is going to be
hard to demonstrate in a simple test case. I suspect we would
need a more realistic test with a number of different uses of
string (and the atomic functions) to get some idea of how much
they might help.

Martin

> 
>   normal              patched
>   ------  1 threads   ------  1 threads
>   ms            934   ms           1015
>   ms/op  0.00005567   ms/op  0.00006050
>   ------  2 threads   ------  2 threads
>   ms           6049   ms           6266
>   ms/op  0.00036055   ms/op  0.00037348
>   ------  4 threads   ------  4 threads
>   ms          11948   ms          11813
>   ms/op  0.00071216   ms/op  0.00070411
>   ------  8 threads   ------  8 threads
>   ms          23855   ms          24743
>   ms/op  0.00142187   ms/op  0.00147480 
> 
> 
> 
> Martin Sebor wrote:
>> 8d is not thread-safe so the atomic function templates should
>> be implemented in terms of ordinary increments and decrements
>> (if they aren't it's a bug). They should only expand to the
>> atomic assembly (or the Win32 Interlocked) functions in 12X
>> and 15X build types.
>>
>> Martin
>>

RE: [PATCH] Use __rw_atomic_xxx() on Windows

Posted by Travis Vitek <tv...@quovadx.com>.

Doh! I should know better. Here is the results from a 12d build on the
same hardware.

  normal              patched
  ------  1 threads   ------  1 threads
  ms            934   ms           1015
  ms/op  0.00005567   ms/op  0.00006050
  ------  2 threads   ------  2 threads
  ms           6049   ms           6266
  ms/op  0.00036055   ms/op  0.00037348
  ------  4 threads   ------  4 threads
  ms          11948   ms          11813
  ms/op  0.00071216   ms/op  0.00070411
  ------  8 threads   ------  8 threads
  ms          23855   ms          24743
  ms/op  0.00142187   ms/op  0.00147480 



Martin Sebor wrote:
>
>8d is not thread-safe so the atomic function templates should
>be implemented in terms of ordinary increments and decrements
>(if they aren't it's a bug). They should only expand to the
>atomic assembly (or the Win32 Interlocked) functions in 12X
>and 15X build types.
>
>Martin
>

Re: [PATCH] Use __rw_atomic_xxx() on Windows

Posted by Martin Sebor <se...@roguewave.com>.

Travis Vitek wrote:
> Since we don't have a string perf test that I could find, I wrote up a
> quick and dirty one that just made many copies of the same string
> repeatedly to exercise the atomic increment/decrement. The results show
> a 3% performance penalty when using the newer atomic functions. This
> test was run with an 8d configuration, so the atomic functions were
> compiled into the stdcxx dll. The test hardware is a Lenovo T60p [Intel
> Core 2 T7600 2.33GHz CPU, 2GB RAM].

8d is not thread-safe so the atomic function templates should
be implemented in terms of ordinary increments and decrements
(if they aren't it's a bug). They should only expand to the
atomic assembly (or the Win32 Interlocked) functions in 12X
and 15X build types.

Martin

> 
>   Old                new [patched]
>   ------  1 threads  ------  1 threads
>   ms            714  ms            737
>   ms/op  0.00004256  ms/op  0.00004393
>   ------  2 threads  ------  2 threads
>   ms           3911  ms           4024
>   ms/op  0.00023311  ms/op  0.00023985
>   ------  4 threads  ------  4 threads
>   ms           7660  ms           7865
>   ms/op  0.00045657  ms/op  0.00046879
>   ------  8 threads  ------  8 threads
>   ms          15192  ms          15585
>   ms/op  0.00090551  ms/op  0.00092894
> 
> I'm wondering if we used inline assembly for the __rw_atomic_* functions
> if the cost would be reduced. We could also evaluate the intrinsic
> pragma that is available on MSVC.
> 
> Travis
> 
>> -----Original Message-----
>>
>> I will do a quick run using the string performance test after lunch.
>> I'll report the results on that later. I've pasted the source for the
>> bulk of my test below. If someone wants the entire thing, let me know
>> and I'll provide everything.
>>
>> Travis
>>

RE: [PATCH] Use __rw_atomic_xxx() on Windows

Posted by Travis Vitek <tv...@quovadx.com>.

Since we don't have a string perf test that I could find, I wrote up a
quick and dirty one that just made many copies of the same string
repeatedly to exercise the atomic increment/decrement. The results show
a 3% performance penalty when using the newer atomic functions. This
test was run with an 8d configuration, so the atomic functions were
compiled into the stdcxx dll. The test hardware is a Lenovo T60p [Intel
Core 2 T7600 2.33GHz CPU, 2GB RAM].

  Old                new [patched]
  ------  1 threads  ------  1 threads
  ms            714  ms            737
  ms/op  0.00004256  ms/op  0.00004393
  ------  2 threads  ------  2 threads
  ms           3911  ms           4024
  ms/op  0.00023311  ms/op  0.00023985
  ------  4 threads  ------  4 threads
  ms           7660  ms           7865
  ms/op  0.00045657  ms/op  0.00046879
  ------  8 threads  ------  8 threads
  ms          15192  ms          15585
  ms/op  0.00090551  ms/op  0.00092894

I'm wondering if we used inline assembly for the __rw_atomic_* functions
if the cost would be reduced. We could also evaluate the intrinsic
pragma that is available on MSVC.

Travis

>-----Original Message-----
>
>I will do a quick run using the string performance test after lunch.
>I'll report the results on that later. I've pasted the source for the
>bulk of my test below. If someone wants the entire thing, let me know
>and I'll provide everything.
>
>Travis
>

RE: [PATCH] Use __rw_atomic_xxx() on Windows

Posted by Travis Vitek <tv...@quovadx.com>.

Oh, yeah. that is the other thing that I did Friday. I wrote a testcase
to compare __rw_atomic_add32() against InterlockedIncrement() on Win32.
There is a performance penalty...

  C:\Temp>t 2 && t 4 && t 8
  ---------- locked inc ---- atomic_add ---- 2 threads
  ms               4266            4469
  ms/op      0.00003178      0.00003330      -4.7586%
  thr ms          18117           18437
  thr ms/op  0.00013498      0.00013737      -1.7663%
  ---------- locked inc ---- atomic_add ---- 4 threads
  ms               7969            8609
  ms/op      0.00005937      0.00006414      -8.0311%
  thr ms          36359           37019
  thr ms/op  0.00027090      0.00027581      -1.8152%
  ---------- locked inc ---- atomic_add ---- 8 threads
  ms               5016            5484
  ms/op      0.00003737      0.00004086      -9.3301%
  thr ms          60846           66130
  thr ms/op  0.00045334      0.00049271      -8.6842%

  C:\Temp>t 2 && t 4 && t 8
  ---------- locked inc ---- atomic_add ---- 2 threads
  ms               2781            2906
  ms/op      0.00002072      0.00002165      -4.4948%
  thr ms          14961           16093
  thr ms/op  0.00011147      0.00011990      -7.5663%
  ---------- locked inc ---- atomic_add ---- 4 threads
  ms               2781            2891
  ms/op      0.00002072      0.00002154      -3.9554%
  thr ms          30867           31328
  thr ms/op  0.00022998      0.00023341      -1.4935%
  ---------- locked inc ---- atomic_add ---- 8 threads
  ms               2782            2890
  ms/op      0.00002073      0.00002153      -3.8821%
  thr ms          64318           64341
  thr ms/op  0.00047921      0.00047938      -0.0358%

I will do a quick run using the string performance test after lunch.
I'll report the results on that later. I've pasted the source for the
bulk of my test below. If someone wants the entire thing, let me know
and I'll provide everything.

Travis


Martin Sebor wrote:
>Subject: Re: [PATCH] Use __rw_atomic_xxx() on Windows
>
>What's the status of this? We need to decide if we can put this
>in 4.2 or defer it for 4.2.1. To put it in 4.2 we need to make
>sure the new functions don't cause a performance regression in
>basic_string. I.e., we need to see the before and after numbers.
>
>Martin
>
>Martin Sebor wrote:
>>
>> One concern I have is performance. Does replacing the intrinsics with
>> out of line function call whose semantics the compiler has no idea
>> about have any impact on the runtime efficiency of the 
>generated code?
>> I would be especially interested in "real life" scenarios such as the
>> usage of the atomic operations in basic_string.
>> 
>> It would be good to see some before and after numbers. If you don't
>> have all the platforms to run the test post your benchmark and Travis
>> can help you put them together.
>

#include <stdio.h>
#include <stdlib.h>

#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#include <process.h>

#include "lib.h"

#define MIN_THREADS 2
#define MAX_THREADS 16

unsigned long locked_inc(long* val, long iters)
{
    const unsigned long t0 = GetTickCount ();

    long n;
    for (n = 0; n < iters; ++n)
    {
        InterlockedIncrement(val);
    }

    const unsigned long t1 = GetTickCount ();

    return (t1 - t0);
}

unsigned long atomic_add(long* val, long iters)
{
    const unsigned long t0 = GetTickCount ();

    long n;
    for (n = 0; n < iters; ++n)
    {
        __rw_atomic_add32(val, 1);
    }

    const unsigned long t1 = GetTickCount ();

    return (t1 - t0);
}

struct thread_param {

    // atomic variable
    long* variable;

    // number of iterations
    long iters;

    // function to invoke
    unsigned long (*fun)(long*, long);

    // result of function
    unsigned long result;

    // thread handle used by main thread
    HANDLE thread;
};

extern "C" {

    void thread_func(void* p)
    {
        thread_param* param = (thread_param*)p;
        param->result = (param->fun)(param->variable, param->iters);
    }

} // extern "C"


unsigned long run_threads(int nthreads, unsigned long (*fun)(long*,
long), long iters)
{
    thread_param params[MAX_THREADS];
    long thread_var = 0;

    int i;
    for (i = 0; i < nthreads; ++i) {
        params[i].variable = &thread_var;
        params[i].result   = 0;
        params[i].fun      = fun;
        params[i].iters    = iters;
    }

    int n;
    for (n = 0; n < nthreads; ++n) {
        params[n].thread = (HANDLE)_beginthread(thread_func, 0,
&params[n]);
    }

    unsigned long thread_time = 0;

    for (n = 0; n < nthreads; ++n) {
        WaitForSingleObject (params[n].thread, INFINITE);
        thread_time += params[n].result;
    }

    return thread_time;
}


int main(int argc, char* argv[])
{
    int nthreads = MIN_THREADS;
    if (1 < argc)
        nthreads = atoi(argv[1]);

    // cap thread count
    if (nthreads < MIN_THREADS)
        nthreads = MIN_THREADS;
    else if (MAX_THREADS < nthreads)
        nthreads = MAX_THREADS;

    const long ops = 0x7ffffff;
    long thread_var;
    
    thread_var = 0;
    unsigned long locked_inc_ms = locked_inc (&thread_var, ops);
    
    thread_var = 0;
    unsigned long atomic_add_ms = atomic_add (&thread_var, ops);

    printf("---------- locked inc ---- atomic_add ---- %d threads\n",
nthreads);
    printf("ms           %8.u        %8.u\n", locked_inc_ms,
atomic_add_ms);

    float locked_inc_ops_p_ms = 1.f * locked_inc_ms / ops;
    float atomic_add_ops_p_ms = 1.f * atomic_add_ms / ops;

    printf("ms/op      %8.8f      %8.8f      %.4f%%\n", 
        locked_inc_ops_p_ms, atomic_add_ops_p_ms,
        100.f * (locked_inc_ops_p_ms - atomic_add_ops_p_ms) /
locked_inc_ops_p_ms);

    // do it with threads

    locked_inc_ms = run_threads(nthreads, locked_inc, ops);
    atomic_add_ms = run_threads(nthreads, atomic_add, ops);

    locked_inc_ms /= nthreads;
    atomic_add_ms /= nthreads;

    printf("thr ms       %8.u        %8.u\n", locked_inc_ms,
atomic_add_ms);

    locked_inc_ops_p_ms = 1.f * locked_inc_ms / ops;
    atomic_add_ops_p_ms = 1.f * atomic_add_ms / ops;

     printf("thr ms/op  %8.8f      %8.8f      %.4f%%\n", 
        locked_inc_ops_p_ms, atomic_add_ops_p_ms,
        100.f * (locked_inc_ops_p_ms - atomic_add_ops_p_ms) /
locked_inc_ops_p_ms);

    return 0;
}