You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@stdcxx.apache.org by Martin Sebor <se...@roguewave.com> on 2007/09/05 03:14:19 UTC
Re: [PATCH] Use __rw_atomic_xxx() on Windows
What's the status of this? We need to decide if we can put this
in 4.2 or defer it for 4.2.1. To put it in 4.2 we need to make
sure the new functions don't cause a performance regression in
basic_string. I.e., we need to see the before and after numbers.
Martin
Martin Sebor wrote:
> One concern I have is performance. Does replacing the intrinsics with
> out of line function call whose semantics the compiler has no idea
> about have any impact on the runtime efficiency of the generated code?
> I would be especially interested in "real life" scenarios such as the
> usage of the atomic operations in basic_string.
>
> It would be good to see some before and after numbers. If you don't
> have all the platforms to run the test post your benchmark and Travis
> can help you put them together.
>
> FYI, in case you're not aware of this (it's not immediately obvious),
> even though we define the full set of atomic operations (i.e., for all
> integer types) the library only uses the overloads for int and long.
>
> Martin
>
> Farid Zaripov wrote:
>> Attached is a patch, adding __rw_atomic_{add|xchg}xx() functions on
>> Win32/Win64 platforms.
>>
>> ChangeLog:
>> * msvc-7.0.config: Added AS config variable.
>> * msvc-8.0-x64.config: Ditto.
>> * filterdef.js: Added definition of the CustomFileDef class
>> * projectdef.js (InitAsmTool): New function to init custom build rule
>> for .asm files.
>> * projects.js: Added definitions of the platform dependent files.
>> * utilities.js: Read AS configuration variable from the .config file.
>> * i86/atomic.asm: New file with definitions of the __rw_atomic_xxx()
>> for Win32 platform.
>> * i86_64/atomic.asm: New file with definitions of the
>> __rw_atomic_xxx() for Windows/x64 platform.
>> * _mutex.h: Removed all dependencies on InterlockedXXX() API functions.
>> Use new __rw_atomic_xxx() functions instead of InterlockedXXX().
>> * once.cpp [_WIN32 && _DLL]: Tell linker to export __atomic_xxx()
>> functions, defined in .asm files.
>>
>> Farid.
>>
>>
>> ------------------------------------------------------------------------
>>
>> Index: etc/config/windows/filterdef.js
>> ===================================================================
>> --- etc/config/windows/filterdef.js (revision 570339)
>> +++ etc/config/windows/filterdef.js (working copy)
>> @@ -25,7 +25,7 @@
>>
>> var sourceFilterName = "Source Files";
>> var sourceFilterUuid = "{4FC737F1-C7A5-4376-A066-2A32D752A2FF}";
>> -var sourceFilterExts = ".cpp;.cxx;.s";
>> +var sourceFilterExts = ".cpp;.cxx;.s;.asm";
>>
>> var headerFilterName = "Header Files";
>> var headerFilterUuid = "{93995380-89BD-4b04-88EB-625FBE52EBFB}";
>> @@ -56,6 +56,21 @@
>> return str;
>> }
>>
>> +//------------------------------------------------
>> +// CustomFileDef class
>> +//------------------------------------------------
>> +
>> +// CustomFileDef .ctor
>> +function CustomFileDef(filepath, platform, initfun)
>> +{
>> + this.filepath = filepath;
>> + this.platform = platform;
>> + this.initfun = initfun;
>> +}
>> +
>> +// global array with platform dependent files definitions
>> +var customFileDefs = new Array();
>> +
>> // common macros
>> var cmnMacros = new Array();
>>
>> @@ -126,7 +141,29 @@
>> var VCFile = filter.AddFile(filename);
>> if (null != filetype && typeof(VCFile.FileType) != "undefined")
>> VCFile.FileType = filetype;
>> -
>> +
>> + var customFileDef = null;
>> +
>> + if (!exclude)
>> + {
>> + // find the platform dependent file definition
>> + for (var i = 0; i < customFileDefs.length; ++i)
>> + {
>> + var custFileDef = customFileDefs[i];
>> + var pos = VCFile.FullPath.length -
>> custFileDef.filepath.length;
>> + if (0 <= pos && pos ==
>> VCFile.FullPath.indexOf(custFileDef.filepath))
>> + {
>> + customFileDef = custFileDef;
>> + break;
>> + }
>> + }
>> +
>> + // exclude this file from build if current platform
>> + // is not custom file target platform
>> + if (null != customFileDef && customFileDef.platform != PLATFORM)
>> + exclude = true;
>> + }
>> + if (exclude)
>> {
>> var cfgs = VCFile.FileConfigurations;
>> @@ -144,6 +181,12 @@
>> cfg.ExcludedFromBuild = exclude;
>> }
>> }
>> + else if (null != customFileDef &&
>> + "undefined" != typeof(customFileDef.initfun))
>> + {
>> + // init
>> + customFileDef.initfun(VCFile);
>> + }
>> }
>>
>> // create VCFilter object from the FilterDef definition
>> Index: etc/config/windows/msvc-7.0.config
>> ===================================================================
>> --- etc/config/windows/msvc-7.0.config (revision 570339)
>> +++ etc/config/windows/msvc-7.0.config (working copy)
>> @@ -38,6 +38,7 @@
>> CXX=cl
>> LD=cl
>> AR=lib
>> +AS=ml
>>
>> // Use singlethreaded or multithreaded CRT in 11s, 11d solution
>> configurations
>> // 0 for MS VisualStudio .NET and MS VisualStudio .NET 2003
>> Index: etc/config/windows/msvc-8.0-x64.config
>> ===================================================================
>> --- etc/config/windows/msvc-8.0-x64.config (revision 570339)
>> +++ etc/config/windows/msvc-8.0-x64.config (working copy)
>> @@ -1,2 +1,3 @@
>> #include msvc-8.0
>> PLATFORM=x64
>> +AS=ml64
>> Index: etc/config/windows/projectdef.js
>> ===================================================================
>> --- etc/config/windows/projectdef.js (revision 570339)
>> +++ etc/config/windows/projectdef.js (working copy)
>> @@ -941,3 +941,25 @@
>>
>> return projectDef;
>> }
>> +
>> +// init custom build rule for .asm files
>> +function InitAsmTool(VCFile)
>> +{
>> + var cfgs = VCFile.FileConfigurations;
>> + for (var i = 1; i <= cfgs.Count; ++i)
>> + {
>> + var cfg = cfgs.Item(i);
>> + if ((typeof(cfg.Tool.ToolKind) != "undefined" &&
>> + cfg.Tool.ToolKind != "VCCustomBuildTool") ||
>> + cfg.Tool.ToolName != "Custom Build Tool")
>> + {
>> + cfg.Tool =
>> cfg.ProjectConfiguration.FileTools.Item("VCCustomBuildTool");
>> + }
>> +
>> + var tool = cfg.Tool;
>> + tool.Description = "Compiling .asm file...";
>> + tool.Outputs = "$(IntDir)\\$(InputName).obj";
>> + tool.CommandLine = AS + " /c /nologo /Fo" + tool.Outputs +
>> + " /W3 /Zi /Ta" + VCFile.RelativePath;
>> + }
>> +}
>> Index: etc/config/windows/projects.js
>> ===================================================================
>> --- etc/config/windows/projects.js (revision 570339)
>> +++ etc/config/windows/projects.js (working copy)
>> @@ -84,6 +84,10 @@
>> projectDefs.push(new Array(configureDef));
>>
>> ///////////////////////////////////////////////////////////////////////////////
>>
>> + // add platform dependent files
>> + customFileDefs.push(new CustomFileDef("i86\\atomic.asm", "Win32",
>> InitAsmTool));
>> + customFileDefs.push(new CustomFileDef("i86_64\\atomic.asm",
>> "x64", InitAsmTool));
>> +
>> var stdcxxDef = new ProjectDef(".stdcxx", typeLibrary);
>> stdcxxDef.VCProjDir = ProjectsDir;
>> stdcxxDef.FilterDefs.push(
>> Index: etc/config/windows/utilities.js
>> ===================================================================
>> --- etc/config/windows/utilities.js (revision 570339)
>> +++ etc/config/windows/utilities.js (working copy)
>> @@ -32,6 +32,7 @@
>> var CXX = "cl";
>> var LD = "cl";
>> var AR = "lib";
>> +var AS = "ml";
>> var SLNVER="8.00";
>> var SLNCOMMENT="";
>> var UNICODELOG = false;
>> @@ -112,6 +113,9 @@
>> case "AR":
>> AR = arr[2];
>> break;
>> + case "AS":
>> + AS = arr[2];
>> + break;
>> case "SLNVER":
>> SLNVER = arr[2];
>> break;
>> @@ -179,6 +183,7 @@
>> stream.WriteLine(" CXX=" + CXX);
>> stream.WriteLine(" LD=" + LD);
>> stream.WriteLine(" AR=" + AR);
>> + stream.WriteLine(" AS=" + AS);
>> stream.WriteLine(" SLNVER=" + SLNVER);
>> stream.WriteLine(" SLNCOMMENT=" + SLNCOMMENT);
>> stream.WriteLine(" UNICODELOG=" + UNICODELOG);
>> Index: include/rw/_mutex.h
>> ===================================================================
>> --- include/rw/_mutex.h (revision 570431)
>> +++ include/rw/_mutex.h (working copy)
>> @@ -140,15 +140,7 @@
>> __declspec (dllimport) void __stdcall
>> DeleteCriticalSection (_RTL_CRITICAL_SECTION*);
>>
>> -__declspec (dllimport) long __stdcall
>> -InterlockedIncrement (_RWSTD_INTERLOCKED_T*);
>> -
>> -__declspec (dllimport) long __stdcall
>> -InterlockedDecrement (_RWSTD_INTERLOCKED_T*);
>> -
>> -__declspec (dllimport) long __stdcall
>> -InterlockedExchange (_RWSTD_INTERLOCKED_T*, long);
>> -
>> +_RWSTD_EXPORT int __rw_atomic_add32 (int*, int);
>> } // extern "C"
>>
>> _RWSTD_NAMESPACE (__rw) {
>> @@ -478,11 +470,11 @@
>> // implicit initialization used to prevent a g++ 2.95.2 warning
>> on Tru64
>> // sorry: semantics of inline function static data are wrong
>> (you'll wind
>> // up with multiple copies)
>> - static volatile long __cntr /* = 0 */; // initialization counter
>> + static volatile int __cntr /* = 0 */; // initialization counter
>>
>> #if defined (_WIN32) || defined (_WIN64)
>> // MT safe
>> - if (0 == __cntr && 1 == InterlockedIncrement ((long*)&__cntr))
>> + if (0 == __cntr && 1 == __rw_atomic_add32 ((int*)&__cntr, +1))
>> #else
>> // not so safe (volatile should help)
>> if (0 == __cntr && 1 == ++__cntr)
>> @@ -1161,19 +1153,20 @@
>> false);
>> }
>> -/********************** i386/gcc **************************************/
>> +/********************** i386/gcc || _M_IX86
>> *********************************/
>>
>> -#elif defined (__i386__) && (defined (__GNUG__) || defined
>> (__INTEL_COMPILER))
>> +#elif defined (__i386__) && (defined (__GNUG__) \
>> + || defined (__INTEL_COMPILER)) || defined (_M_IX86)
>>
>> extern "C" {
>>
>> -char __rw_atomic_add8 (char*, int);
>> -short __rw_atomic_add16 (short*, short);
>> -int __rw_atomic_add32 (int*, int);
>> +_RWSTD_EXPORT char __rw_atomic_add8 (char*, int);
>> +_RWSTD_EXPORT short __rw_atomic_add16 (short*, short);
>> +_RWSTD_EXPORT int __rw_atomic_add32 (int*, int);
>>
>> -char __rw_atomic_xchg8 (char*, char);
>> -short __rw_atomic_xchg16 (short*, short);
>> -int __rw_atomic_xchg32 (int*, int);
>> +_RWSTD_EXPORT char __rw_atomic_xchg8 (char*, char);
>> +_RWSTD_EXPORT short __rw_atomic_xchg16 (short*, short);
>> +_RWSTD_EXPORT int __rw_atomic_xchg32 (int*, int);
>>
>> } // extern "C"
>>
>> @@ -1349,85 +1342,39 @@
>> _RWSTD_STATIC_CAST (int, __y));
>> }
>>
>> +/********************** IA64/x86_64/_M_X64
>> *****************************/
>>
>> -/********************** WIN 32/64 ************************************/
>> +#elif defined (__ia64) || defined (__x86_64) || defined (_M_X64)
>>
>> -#elif defined (_WIN32)
>> +extern "C" {
>>
>> -// Interlocked[In|De]crement functions atomically modify their argument
>> -// and return the new value
>> +_RWSTD_EXPORT _RWSTD_INT8_T
>> +__rw_atomic_xchg8 (_RWSTD_INT8_T*, _RWSTD_INT8_T);
>>
>> -// InterlockedExchange atomically sets the value pointed to by the first
>> -// argument to that of the second argument and returns the original
>> value
>> +_RWSTD_EXPORT _RWSTD_INT16_T
>> +__rw_atomic_xchg16 (_RWSTD_INT16_T*, _RWSTD_INT16_T);
>>
>> -inline int
>> -__rw_atomic_preincrement (int &__x, bool)
>> -{
>> - _RWSTD_COMPILE_ASSERT (sizeof __x == sizeof (long));
>> - return InterlockedIncrement (_RWSTD_REINTERPRET_CAST (long*, &__x));
>> -}
>> +_RWSTD_EXPORT _RWSTD_INT32_T
>> +__rw_atomic_xchg32 (_RWSTD_INT32_T*, _RWSTD_INT32_T);
>>
>>
>> -inline unsigned int
>> -__rw_atomic_preincrement (unsigned int &__x, bool)
>> -{
>> - return __rw_atomic_preincrement (_RWSTD_REINTERPRET_CAST (int&,
>> __x),
>> - false);
>> -}
>> +_RWSTD_EXPORT _RWSTD_INT8_T
>> +__rw_atomic_add8 (_RWSTD_INT8_T*, _RWSTD_INT8_T);
>>
>> +_RWSTD_EXPORT _RWSTD_INT16_T
>> +__rw_atomic_add16 (_RWSTD_INT16_T*, _RWSTD_INT16_T);
>>
>> -inline int
>> -__rw_atomic_predecrement (int &__x, bool)
>> -{
>> - _RWSTD_COMPILE_ASSERT (sizeof __x == sizeof (long));
>> - return InterlockedDecrement (_RWSTD_REINTERPRET_CAST (long*, &__x));
>> -}
>> +_RWSTD_EXPORT _RWSTD_INT32_T
>> +__rw_atomic_add32 (_RWSTD_INT32_T*, _RWSTD_INT32_T);
>>
>> -
>> -inline unsigned int
>> -__rw_atomic_predecrement (unsigned int &__x, bool)
>> -{
>> - return __rw_atomic_predecrement (_RWSTD_REINTERPRET_CAST (int&,
>> __x),
>> - false);
>> -}
>> -
>> -
>> -inline int
>> -__rw_atomic_exchange (int &__x, int __y, bool)
>> -{
>> - _RWSTD_COMPILE_ASSERT (sizeof __x == sizeof (long));
>> - return InterlockedExchange (_RWSTD_REINTERPRET_CAST (long*, &__x),
>> - _RWSTD_STATIC_CAST (long, __y));
>> -}
>> -
>> -
>> -inline unsigned int
>> -__rw_atomic_exchange (unsigned int &__x, unsigned int __y, bool)
>> -{
>> - return __rw_atomic_exchange (_RWSTD_REINTERPRET_CAST (int&, __x),
>> - _RWSTD_STATIC_CAST (int, __y), false);
>> -}
>> -
>> -/********************** IA64/x86_64 ***********************************/
>> -
>> -#elif defined (__ia64) || defined (__x86_64)
>> -
>> -extern "C" {
>> -
>> -_RWSTD_INT8_T __rw_atomic_xchg8 (_RWSTD_INT8_T*, _RWSTD_INT8_T);
>> -_RWSTD_INT16_T __rw_atomic_xchg16 (_RWSTD_INT16_T*, _RWSTD_INT16_T);
>> -_RWSTD_INT32_T __rw_atomic_xchg32 (_RWSTD_INT32_T*, _RWSTD_INT32_T);
>> -
>> -
>> -_RWSTD_INT8_T __rw_atomic_add8 (_RWSTD_INT8_T*, _RWSTD_INT8_T);
>> -_RWSTD_INT16_T __rw_atomic_add16 (_RWSTD_INT16_T*, _RWSTD_INT16_T);
>> -_RWSTD_INT32_T __rw_atomic_add32 (_RWSTD_INT32_T*, _RWSTD_INT32_T);
>> -
>> #ifdef _RWSTD_INT64_T
>>
>> -_RWSTD_INT64_T __rw_atomic_xchg64 (_RWSTD_INT64_T*, _RWSTD_INT64_T);
>> -_RWSTD_INT64_T __rw_atomic_add64 (_RWSTD_INT64_T*, _RWSTD_INT64_T);
>> +_RWSTD_EXPORT _RWSTD_INT64_T
>> +__rw_atomic_xchg64 (_RWSTD_INT64_T*, _RWSTD_INT64_T);
>>
>> +_RWSTD_EXPORT _RWSTD_INT64_T
>> +__rw_atomic_add64 (_RWSTD_INT64_T*, _RWSTD_INT64_T);
>> +
>> #endif // _RWSTD_INT64_T
>>
>> } // extern "C"
>> Index: src/i86/atomic.asm
>> ===================================================================
>> --- src/i86/atomic.asm (revision 0)
>> +++ src/i86/atomic.asm (revision 0)
>> @@ -0,0 +1,178 @@
>> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
>>
>> +;
>> +; i86/atomic.asm
>> +;
>> +; $Id$
>> +;
>> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
>>
>> +;
>> +; Licensed to the Apache Software Foundation (ASF) under one or more
>> +; contributor license agreements. See the NOTICE file distributed
>> +; with this work for additional information regarding copyright
>> +; ownership. The ASF licenses this file to you under the Apache
>> +; License, Version 2.0 (the "License"); you may not use this file
>> +; except in compliance with the License. You may obtain a copy of
>> +; the License at
>> +;
>> +; http://www.apache.org/licenses/LICENSE-2.0
>> +;
>> +; Unless required by applicable law or agreed to in writing, software
>> +; distributed under the License is distributed on an "AS IS" BASIS,
>> +; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
>> +; implied. See the License for the specific language governing
>> +; permissions and limitations under the License.
>> +;
>> +; Copyright 2003-2006 Rogue Wave Software.
>> +;
>> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
>>
>> +
>> +
>> + .486
>> + .model flat
>> + .code
>> +
>> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
>>
>> +; extern "C" char __rw_atomic_xchg8 (char *x, char y);
>> +;
>> +; Atomically assigns the 8-bit value y to *x and returns
>> +; the original (before assignment) 8-bit value of *x.
>> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
>>
>> +
>> + align 4
>> + public ___rw_atomic_xchg8
>> +___rw_atomic_xchg8 proc ; char (char *x, char y)
>> +
>> +arg_x = dword ptr 4
>> +arg_y = byte ptr 8
>> +
>> +        mov     ecx, [esp+arg_x]    ; %ecx = x
>> +        mov     al, [esp+arg_y]     ; %al = y
>> +        xchg    al, [ecx]           ; %al <-> (%ecx)
>> +        ret
>> +___rw_atomic_xchg8 endp
>> +
>> +
>> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
>>
>> +; extern "C" short __rw_atomic_xchg16 (short *x, short y);
>> +;
>> +; Atomically assigns the 16-bit value y to *x and returns
>> +; the original (before assignment) 16-bit value of *x.
>> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
>>
>> +
>> + align 4
>> + public ___rw_atomic_xchg16
>> +___rw_atomic_xchg16 proc ; short (short *x, short y)
>> +
>> +arg_x = dword ptr 4
>> +arg_y = word ptr 8
>> +
>> +        mov     ecx, [esp+arg_x]    ; %ecx = x
>> +        mov     ax, [esp+arg_y]     ; %ax = y
>> +        xchg    ax, [ecx]           ; %ax <-> (%ecx)
>> +        ret
>> +___rw_atomic_xchg16 endp
>> +
>> +
>> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
>>
>> +; extern "C" int __rw_atomic_xchg32 (int *x, int y);
>> +;
>> +; Atomically assigns the 32-bit value y to *x and returns
>> +; the original (before assignment) 32-bit value of *x.
>> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
>>
>> +
>> + align 4
>> + public ___rw_atomic_xchg32
>> +___rw_atomic_xchg32 proc ; int (int *x, int y)
>> +
>> +arg_x = dword ptr 4
>> +arg_y = dword ptr 8
>> +
>> +        mov     ecx, [esp+arg_x]    ; %ecx = x
>> + mov eax, [esp+arg_y] ; %eax = y
>> + xchg eax, [ecx] ; %eax <-> (%ecx)
>> + ret
>> +___rw_atomic_xchg32 endp
>> +
>> +
>> +
>> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
>>
>> +; extern "C" char __rw_atomic_add8 (char *x, int y);
>> +;
>> +; Atomically increments the 8-bit value *x by y and returns
>> +; the new (after increment) 8-bit value of *x.
>> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
>>
>> +
>> + align 4
>> + public ___rw_atomic_add8
>> +___rw_atomic_add8 proc ; char (char *dst, int inc)
>> +
>> +arg_dst = dword ptr 4
>> +arg_inc = dword ptr 8
>> +
>> +        mov     ecx, [esp+arg_dst]  ; %ecx = dst
>> +        mov     eax, [esp+arg_inc]  ; %eax = inc
>> +        mov     edx, eax
>> +
>> +   lock xadd    [ecx], al           ; tmp = *dst;
>> +                                    ; dst += inc;
>> +                                    ; %al = tmp
>> +
>> +        add     eax, edx            ; return %eax + inc
>> +        ret
>> +___rw_atomic_add8 endp
>> +
>> +
>> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
>>
>> +; extern "C" short __rw_atomic_add16 (short *x, short y);
>> +;
>> +; Atomically increments the 16-bit value *x by y and returns
>> +; the new (after increment) 16-bit value of *x.
>> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
>>
>> +
>> + align 4
>> + public ___rw_atomic_add16
>> +___rw_atomic_add16 proc ; short (short *dst,
>> short inc)
>> +
>> +arg_dst = dword ptr 4
>> +arg_inc = dword ptr 8
>> +
>> +        mov     ecx, [esp+arg_dst]  ; %ecx = dst
>> +        mov     eax, [esp+arg_inc]  ; %eax = inc
>> +        mov     edx, eax
>> +
>> +   lock xadd    [ecx], ax           ; tmp = *dst;
>> +                                    ; dst += inc;
>> +                                    ; %ax = tmp
>> +
>> +        add     eax, edx            ; return %eax + inc
>> +        ret
>> +___rw_atomic_add16 endp
>> +
>> +
>> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
>>
>> +; extern "C" int __rw_atomic_add32 (int *x, int y);
>> +;
>> +; Atomically increments the 32-bit value *x by y and returns
>> +; the new (after increment) 32-bit value of *x.
>> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
>>
>> +
>> + align 4
>> + public ___rw_atomic_add32
>> +___rw_atomic_add32 proc ; int (int *dst, int inc)
>> +
>> +arg_dst = dword ptr 4
>> +arg_inc = dword ptr 8
>> +
>> +        mov     ecx, [esp+arg_dst]  ; %ecx = dst
>> +        mov     edx, [esp+arg_inc]  ; %edx = inc
>> +        mov     eax, edx
>> +
>> +   lock xadd    [ecx], eax          ; tmp = *dst;
>> +                                    ; dst += inc;
>> +                                    ; %eax = tmp
>> +
>> +        add     eax, edx            ; return %eax + inc
>> +        ret
>> +___rw_atomic_add32 endp
>> +
>> + end
>>
>> Property changes on: src\i86\atomic.asm
>> ___________________________________________________________________
>> Name: svn:keywords
>> + Id
>> Name: svn:eol-style
>> + native
>>
>> Index: src/i86_64/atomic.asm
>> ===================================================================
>> --- src/i86_64/atomic.asm (revision 0)
>> +++ src/i86_64/atomic.asm (revision 0)
>> @@ -0,0 +1,186 @@
>> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
>>
>> +;
>> +; i86_64/atomic.asm
>> +;
>> +; $Id$
>> +;
>> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
>>
>> +;
>> +; Licensed to the Apache Software Foundation (ASF) under one or more
>> +; contributor license agreements. See the NOTICE file distributed
>> +; with this work for additional information regarding copyright
>> +; ownership. The ASF licenses this file to you under the Apache
>> +; License, Version 2.0 (the "License"); you may not use this file
>> +; except in compliance with the License. You may obtain a copy of
>> +; the License at
>> +;
>> +; http://www.apache.org/licenses/LICENSE-2.0
>> +;
>> +; Unless required by applicable law or agreed to in writing, software
>> +; distributed under the License is distributed on an "AS IS" BASIS,
>> +; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
>> +; implied. See the License for the specific language governing
>> +; permissions and limitations under the License.
>> +;
>> +; Copyright 2003-2006 Rogue Wave Software.
>> +;
>> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
>>
>> +
>> +
>> + .code
>> +
>> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
>>
>> +; extern "C" int8_t __rw_atomic_xchg8 (int8_t *x, int8_t y);
>> +;
>> +; Atomically assigns the 8-bit value y to *x and returns
>> +; the original (before assignment) 8-bit value of *x.
>> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
>>
>> +
>> + align 16
>> + public __rw_atomic_xchg8
>> +__rw_atomic_xchg8 proc ; int8_t (int8_t *x, int8_t y)
>> + ; %rcx = x
>> +        mov     al, dl              ; %al = y
>> +        xchg    al, [rcx]           ; %al <-> (%rcx)
>> + ret
>> +__rw_atomic_xchg8 endp
>> +
>> +
>> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
>>
>> +; extern "C" int16_t __rw_atomic_xchg16 (int16_t *x, int16_t y);
>> +;
>> +; Atomically assigns the 16-bit value y to *x and returns
>> +; the original (before assignment) 16-bit value of *x.
>> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
>>
>> +
>> + align 16
>> + public __rw_atomic_xchg16
>> +__rw_atomic_xchg16 proc ; int16_t (int16_t *x,
>> int16_t y)
>> + ; %rcx = x
>> + mov ax, dx ; %ax = y
>> + xchg ax, [rcx] ; %ax <-> (%rcx)
>> + ret
>> +__rw_atomic_xchg16 endp
>> +
>> +
>> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
>>
>> +; extern "C" int32_t __rw_atomic_xchg32 (int32_t *x, int32_t y);
>> +;
>> +; Atomically assigns the 32-bit value y to *x and returns
>> +; the original (before assignment) 32-bit value of *x.
>> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
>>
>> +
>> + align 16
>> + public __rw_atomic_xchg32
>> +__rw_atomic_xchg32 proc ; int32_t (int32_t *x,
>> int32_t y)
>> + ; %rcx = x
>> + mov eax, edx ; %eax = y
>> + xchg eax, [rcx] ; %eax <-> (%rcx)
>> + ret
>> +__rw_atomic_xchg32 endp
>> +
>> +
>> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
>>
>> +; extern "C" int64_t __rw_atomic_xchg64 (int64_t *x, int64_t y);
>> +;
>> +; Atomically assigns the 64-bit value y to *x and returns
>> +; the original (before assignment) 64-bit value of *x.
>> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
>>
>> +
>> + align 16
>> + public __rw_atomic_xchg64
>> +__rw_atomic_xchg64 proc ; int64_t (int64_t *x,
>> int64_t y)
>> + ; %rcx = x
>> + mov rax, rdx ; %rax = y
>> + xchg rax, [rcx] ; %rax <-> (%rcx)
>> + ret
>> +__rw_atomic_xchg64 endp
>> +
>> +
>> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
>>
>> +; extern "C" int8_t __rw_atomic_add8 (int8_t *x, int8_t y);
>> +;
>> +; Atomically increments the 8-bit value *x by y and returns
>> +; the new (after increment) 8-bit value of *x.
>> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
>>
>> +
>> + align 16
>> + public __rw_atomic_add8
>> +__rw_atomic_add8 proc ; int8_t (int8_t *dst,
>> int8_t inc)
>> + ; %rcx = dst
>> + mov eax, edx ; %eax = inc
>> +
>> + lock xadd [rcx], al ; tmp = *dst
>> + ; dst += inc
>> + ; %al = tmp
>> + add eax, edx ; return %al + inc
>> + ret
>> +__rw_atomic_add8 endp
>> +
>> +
>> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
>>
>> +; extern "C" int16_t __rw_atomic_add16 (int16_t *x, int16_t y);
>> +;
>> +; Atomically increments the 16-bit value *x by y and returns
>> +; the new (after increment) 16-bit value of *x.
>> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
>>
>> +
>> + align 16
>> + public __rw_atomic_add16
>> +__rw_atomic_add16 proc ; int16_t (int16_t *dst,
>> int16_t inc)
>> + ; %rcx = dst
>> +        mov     ax, dx              ; %ax = inc
>> +
>> +   lock xadd    [rcx], ax           ; tmp = *dst
>> +                                    ; dst += inc
>> +                                    ; %ax = tmp
>> + add ax, dx ; return %ax + inc
>> + ret
>> +__rw_atomic_add16 endp
>> +
>> +
>> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
>>
>> +; extern "C" int32_t __rw_atomic_add32 (int32_t *x, int32_t y);
>> +;
>> +; Atomically increments the 32-bit value *x by y and returns
>> +; the new (after increment) 32-bit value of *x.
>> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
>>
>> +
>> + align 16
>> + public __rw_atomic_add32
>> +__rw_atomic_add32 proc ; int32_t (int32_t *dst,
>> int32_t inc)
>> + ; %rcx = dst
>> + mov eax, edx ; %eax = inc
>> +
>> + lock xadd [rcx], eax ; tmp = *dst
>> + ; dst += inc
>> + ; %eax = tmp
>> +
>> + add eax, edx ; return %eax + inc
>> + ret
>> +__rw_atomic_add32 endp
>> +
>> +
>> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
>>
>> +; extern "C" int64_t __rw_atomic_add64 (int64_t *x, int64_t y);
>> +;
>> +; Atomically increments the 64-bit value *x by y and returns
>> +; the new (after increment) 64-bit value of *x.
>> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
>>
>> +
>> + align 16
>> + public __rw_atomic_add64
>> +__rw_atomic_add64 proc ; int64_t (int64_t *dst,
>> int64_t inc)
>> + ; %rcx = dst
>> +        mov     rax, rdx            ; %rax = inc
>> +
>> +   lock xadd    [rcx], rax          ; tmp = *dst
>> +                                    ; dst += inc
>> +                                    ; %rax = tmp
>> +
>> +        add     rax, rdx            ; return %rax + inc
>> + ret
>> +__rw_atomic_add64 endp
>> +
>> + end
>>
>> Property changes on: src\i86_64\atomic.asm
>> ___________________________________________________________________
>> Name: svn:keywords
>> + Id
>> Name: svn:eol-style
>> + native
>>
>> Index: src/once.cpp
>> ===================================================================
>> --- src/once.cpp (revision 570339)
>> +++ src/once.cpp (working copy)
>> @@ -188,3 +188,32 @@
>> } // extern "C"
>>
>> } // namespace __rw
>> +
>> +// export __rw_atomic_xxx() functions, defined in atomic.asm
>> +#if defined (_WIN32) && defined (_DLL)
>> +
>> +# if defined (_M_IX86)
>> +
>> +# pragma comment(linker, "/EXPORT:___rw_atomic_add8")
>> +# pragma comment(linker, "/EXPORT:___rw_atomic_add16")
>> +# pragma comment(linker, "/EXPORT:___rw_atomic_add32")
>> +# pragma comment(linker, "/EXPORT:___rw_atomic_xchg8")
>> +# pragma comment(linker, "/EXPORT:___rw_atomic_xchg16")
>> +# pragma comment(linker, "/EXPORT:___rw_atomic_xchg32")
>> +
>> +# elif defined (_M_X64)
>> +
>> +# pragma comment(linker, "/EXPORT:__rw_atomic_add8")
>> +# pragma comment(linker, "/EXPORT:__rw_atomic_add16")
>> +# pragma comment(linker, "/EXPORT:__rw_atomic_add32")
>> +# pragma comment(linker, "/EXPORT:__rw_atomic_xchg8")
>> +# pragma comment(linker, "/EXPORT:__rw_atomic_xchg16")
>> +# pragma comment(linker, "/EXPORT:__rw_atomic_xchg32")
>> +
>> +# ifdef _RWSTD_INT64_T
>> +# pragma comment(linker, "/EXPORT:__rw_atomic_add64")
>> +# pragma comment(linker, "/EXPORT:__rw_atomic_xchg64")
>> +# endif // _RWSTD_INT64_T
>> +# endif // _M_IX86
>> +
>> +#endif // _WIN32 && _DLL
>
Re: [PATCH] Use __rw_atomic_xxx() on Windows
Posted by Martin Sebor <se...@roguewave.com>.
Farid Zaripov wrote:
>> -----Original Message-----
>> From: Martin Sebor [mailto:sebor@roguewave.com]
>> Sent: Thursday, September 06, 2007 5:49 AM
>> To: stdcxx-dev@incubator.apache.org
>> Subject: Re: [PATCH] Use __rw_atomic_xxx() on Windows
>>
>> Travis Vitek wrote:
>>> Oh, yeah. that is the other thing that I did Friday. I wrote a
>>> testcase to compare __rw_atomic_add32() against
>> InterlockedIncrement() on Win32.
>>> There is a performance penalty...
>> I'd be curious to know if the performance penalty is due to
>> the function call overhead or something else.
>>
>> In any case though, I think we could tweak the patch and
>> change the __rw_atomic_pre{de,in}crement() overloads for int
>> and long to call the appropriate Interlocked{De,In}crement()
>> intrinsics and have the other overloads use the new ones.
>>
>> Farid, what do you think about this approach?
>
> I agree. And I decided to make the changes above
> without making the tests to see the performance penalty.
Okay, that sounds like a good approach to me. You can still
commit the int and long atomic functions, we just won't call
them.
Martin
RE: [PATCH] Use __rw_atomic_xxx() on Windows
Posted by Farid Zaripov <Fa...@epam.com>.
> -----Original Message-----
> From: Martin Sebor [mailto:sebor@roguewave.com]
> Sent: Thursday, September 06, 2007 5:49 AM
> To: stdcxx-dev@incubator.apache.org
> Subject: Re: [PATCH] Use __rw_atomic_xxx() on Windows
>
> Travis Vitek wrote:
> > Oh, yeah. that is the other thing that I did Friday. I wrote a
> > testcase to compare __rw_atomic_add32() against
> InterlockedIncrement() on Win32.
> > There is a performance penalty...
>
> I'd be curious to know if the performance penalty is due to
> the function call overhead or something else.
>
> In any case though, I think we could tweak the patch and
> change the __rw_atomic_pre{de,in}crement() overloads for int
> and long to call the appropriate Interlocked{De,In}crement()
> intrinsics and have the other overloads use the new ones.
>
> Farid, what do you think about this approach?
I agree. And I decided to make the changes above
without making the tests to see the performance penalty.
Farid.
Re: [PATCH] Use __rw_atomic_xxx() on Windows
Posted by Martin Sebor <se...@roguewave.com>.
Travis Vitek wrote:
> Oh, yeah. that is the other thing that I did Friday. I wrote a testcase
> to compare __rw_atomic_add32() against InterlockedIncrement() on Win32.
> There is a performance penalty...
I'd be curious to know if the performance penalty is due to the
function call overhead or something else.
In any case though, I think we could tweak the patch and change
the __rw_atomic_pre{de,in}crement() overloads for int and long
to call the appropriate Interlocked{De,In}crement() intrinsics
and have the other overloads use the new ones.
Farid, what do you think about this approach?
Martin
>
> C:\Temp>t 2 && t 4 && t 8
> ---------- locked inc ---- atomic_add ---- 2 threads
> ms 4266 4469
> ms/op 0.00003178 0.00003330 -4.7586%
> thr ms 18117 18437
> thr ms/op 0.00013498 0.00013737 -1.7663%
> ---------- locked inc ---- atomic_add ---- 4 threads
> ms 7969 8609
> ms/op 0.00005937 0.00006414 -8.0311%
> thr ms 36359 37019
> thr ms/op 0.00027090 0.00027581 -1.8152%
> ---------- locked inc ---- atomic_add ---- 8 threads
> ms 5016 5484
> ms/op 0.00003737 0.00004086 -9.3301%
> thr ms 60846 66130
> thr ms/op 0.00045334 0.00049271 -8.6842%
>
> C:\Temp>t 2 && t 4 && t 8
> ---------- locked inc ---- atomic_add ---- 2 threads
> ms 2781 2906
> ms/op 0.00002072 0.00002165 -4.4948%
> thr ms 14961 16093
> thr ms/op 0.00011147 0.00011990 -7.5663%
> ---------- locked inc ---- atomic_add ---- 4 threads
> ms 2781 2891
> ms/op 0.00002072 0.00002154 -3.9554%
> thr ms 30867 31328
> thr ms/op 0.00022998 0.00023341 -1.4935%
> ---------- locked inc ---- atomic_add ---- 8 threads
> ms 2782 2890
> ms/op 0.00002073 0.00002153 -3.8821%
> thr ms 64318 64341
> thr ms/op 0.00047921 0.00047938 -0.0358%
>
> I will do a quick run using the string performance test after lunch.
> I'll report the results on that later. I've pasted the source for the
> bulk of my test below. If someone wants the entire thing, let me know
> and I'll provide everything.
>
> Travis
>
>
> Martin Sebor wrote:
>> Subject: Re: [PATCH] Use __rw_atomic_xxx() on Windows
>>
>> What's the status of this? We need to decide if we can put this
>> in 4.2 or defer it for 4.2.1. To put it in 4.2 we need to make
>> sure the new functions don't cause a performance regression in
>> basic_string. I.e., we need to see the before and after numbers.
>>
>> Martin
>>
>> Martin Sebor wrote:
>>> One concern I have is performance. Does replacing the intrinsics with
>>> out of line function call whose semantics the compiler has no idea
>>> about have any impact on the runtime efficiency of the
>> generated code?
>>> I would be especially interested in "real life" scenarios such as the
>>> usage of the atomic operations in basic_string.
>>>
>>> It would be good to see some before and after numbers. If you don't
>>> have all the platforms to run the test post your benchmark and Travis
>>> can help you put them together.
>
> #include <stdio.h>
> #include <stdlib.h>
>
> #define WIN32_LEAN_AND_MEAN
> #include <windows.h>
> #include <process.h>
>
> #include "lib.h"
>
> #define MIN_THREADS 2
> #define MAX_THREADS 16
>
> unsigned long locked_inc(long* val, long iters)
> {
> const unsigned long t0 = GetTickCount ();
>
> long n;
> for (n = 0; n < iters; ++n)
> {
> InterlockedIncrement(val);
> }
>
> const unsigned long t1 = GetTickCount ();
>
> return (t1 - t0);
> }
>
> unsigned long atomic_add(long* val, long iters)
> {
> const unsigned long t0 = GetTickCount ();
>
> long n;
> for (n = 0; n < iters; ++n)
> {
> __rw_atomic_add32(val, 1);
> }
>
> const unsigned long t1 = GetTickCount ();
>
> return (t1 - t0);
> }
>
> struct thread_param {
>
> // atomic variable
> long* variable;
>
> // number of iterations
> long iters;
>
> // function to invoke
> unsigned long (*fun)(long*, long);
>
> // result of function
> unsigned long result;
>
> // thread handle used by main thread
> HANDLE thread;
> };
>
> extern "C" {
>
> void thread_func(void* p)
> {
> thread_param* param = (thread_param*)p;
> param->result = (param->fun)(param->variable, param->iters);
> }
>
> } // extern "C"
>
>
> unsigned long run_threads(int nthreads, unsigned long (*fun)(long*,
> long), long iters)
> {
> thread_param params[MAX_THREADS];
> long thread_var = 0;
>
> int i;
> for (i = 0; i < nthreads; ++i) {
> params[i].variable = &thread_var;
> params[i].result = 0;
> params[i].fun = fun;
> params[i].iters = iters;
> }
>
> int n;
> for (n = 0; n < nthreads; ++n) {
> params[n].thread = (HANDLE)_beginthread(thread_func, 0,
> &params[n]);
> }
>
> unsigned long thread_time = 0;
>
> for (n = 0; n < nthreads; ++n) {
> WaitForSingleObject (params[n].thread, INFINITE);
> thread_time += params[n].result;
> }
>
> return thread_time;
> }
>
>
> int main(int argc, char* argv[])
> {
> int nthreads = MIN_THREADS;
> if (1 < argc)
> nthreads = atoi(argv[1]);
>
> // cap thread count
> if (nthreads < MIN_THREADS)
> nthreads = MIN_THREADS;
> else if (MAX_THREADS < nthreads)
> nthreads = MAX_THREADS;
>
> const long ops = 0x7ffffff;
> long thread_var;
>
> thread_var = 0;
> unsigned long locked_inc_ms = locked_inc (&thread_var, ops);
>
> thread_var = 0;
> unsigned long atomic_add_ms = atomic_add (&thread_var, ops);
>
> printf("---------- locked inc ---- atomic_add ---- %d threads\n",
> nthreads);
> printf("ms %8.u %8.u\n", locked_inc_ms,
> atomic_add_ms);
>
> float locked_inc_ops_p_ms = 1.f * locked_inc_ms / ops;
> float atomic_add_ops_p_ms = 1.f * atomic_add_ms / ops;
>
> printf("ms/op %8.8f %8.8f %.4f%%\n",
> locked_inc_ops_p_ms, atomic_add_ops_p_ms,
> 100.f * (locked_inc_ops_p_ms - atomic_add_ops_p_ms) /
> locked_inc_ops_p_ms);
>
> // do it with threads
>
> locked_inc_ms = run_threads(nthreads, locked_inc, ops);
> atomic_add_ms = run_threads(nthreads, atomic_add, ops);
>
> locked_inc_ms /= nthreads;
> atomic_add_ms /= nthreads;
>
> printf("thr ms %8.u %8.u\n", locked_inc_ms,
> atomic_add_ms);
>
> locked_inc_ops_p_ms = 1.f * locked_inc_ms / ops;
> atomic_add_ops_p_ms = 1.f * atomic_add_ms / ops;
>
> printf("thr ms/op %8.8f %8.8f %.4f%%\n",
> locked_inc_ops_p_ms, atomic_add_ops_p_ms,
> 100.f * (locked_inc_ops_p_ms - atomic_add_ops_p_ms) /
> locked_inc_ops_p_ms);
>
> return 0;
> }
Re: [PATCH] Use __rw_atomic_xxx() on Windows
Posted by Martin Sebor <se...@roguewave.com>.
Travis Vitek wrote:
> Doh! I should know better. Here is the results from a 12d build on the
> same hardware.
Does this mean that there is almost no difference between the
intrinsic functions and the out of line ones, or that the test
is too simple to demonstrate them?
I expect the greatest advantage of the intrinsics over ordinary
out-of-line functions to be that they (might) make it possible
for the optimizer to generate better code *in certain contexts*
depending on from where they are called. This is going to be
hard to demonstrate in a simple test case. I suspect we would
need a more realistic test with a number of different uses of
string (and the atomic functions) to get some idea of how much
they might help.
Martin
>
> normal patched
> ------ 1 threads ------ 1 threads
> ms 934 ms 1015
> ms/op 0.00005567 ms/op 0.00006050
> ------ 2 threads ------ 2 threads
> ms 6049 ms 6266
> ms/op 0.00036055 ms/op 0.00037348
> ------ 4 threads ------ 4 threads
> ms 11948 ms 11813
> ms/op 0.00071216 ms/op 0.00070411
> ------ 8 threads ------ 8 threads
> ms 23855 ms 24743
> ms/op 0.00142187 ms/op 0.00147480
>
>
>
> Martin Sebor wrote:
>> 8d is not thread-safe so the atomic function templates should
>> be implemented in terms of ordinary increments and decrements
>> (if they aren't it's a bug). They should only expand to the
>> atomic assembly (or the Win32 Interlocked) functions in 12X
>> and 15X build types.
>>
>> Martin
>>
RE: [PATCH] Use __rw_atomic_xxx() on Windows
Posted by Travis Vitek <tv...@quovadx.com>.
Doh! I should know better. Here are the results from a 12d build on the
same hardware.
normal patched
------ 1 threads ------ 1 threads
ms 934 ms 1015
ms/op 0.00005567 ms/op 0.00006050
------ 2 threads ------ 2 threads
ms 6049 ms 6266
ms/op 0.00036055 ms/op 0.00037348
------ 4 threads ------ 4 threads
ms 11948 ms 11813
ms/op 0.00071216 ms/op 0.00070411
------ 8 threads ------ 8 threads
ms 23855 ms 24743
ms/op 0.00142187 ms/op 0.00147480
Martin Sebor wrote:
>
>8d is not thread-safe so the atomic function templates should
>be implemented in terms of ordinary increments and decrements
>(if they aren't it's a bug). They should only expand to the
>atomic assembly (or the Win32 Interlocked) functions in 12X
>and 15X build types.
>
>Martin
>
Re: [PATCH] Use __rw_atomic_xxx() on Windows
Posted by Martin Sebor <se...@roguewave.com>.
Travis Vitek wrote:
> Since we don't have a string perf test that I could find, I wrote up a
> quick and dirty one that just made many copies of the same string
> repeatedly to exercise the atomic increment/decrement. The results show
> a 3% performance penalty when using the newer atomic functions. This
> test was run with an 8d configuration, so the atomic functions were
> compiled into the stdcxx dll. The test hardware is a Lenovo T60p [Intel
> Core 2 T7600 2.33GHz CPU, 2GB RAM].
8d is not thread-safe so the atomic function templates should
be implemented in terms of ordinary increments and decrements
(if they aren't it's a bug). They should only expand to the
atomic assembly (or the Win32 Interlocked) functions in 12X
and 15X build types.
Martin
>
> Old new [patched]
> ------ 1 threads ------ 1 threads
> ms 714 ms 737
> ms/op 0.00004256 ms/op 0.00004393
> ------ 2 threads ------ 2 threads
> ms 3911 ms 4024
> ms/op 0.00023311 ms/op 0.00023985
> ------ 4 threads ------ 4 threads
> ms 7660 ms 7865
> ms/op 0.00045657 ms/op 0.00046879
> ------ 8 threads ------ 8 threads
> ms 15192 ms 15585
> ms/op 0.00090551 ms/op 0.00092894
>
> I'm wondering if we used inline assembly for the __rw_atomic_* functions
> if the cost would be reduced. We could also evaluate the intrinsic
> pragma that is available on MSVC.
>
> Travis
>
>> -----Original Message-----
>>
>> I will do a quick run using the string performance test after lunch.
>> I'll report the results on that later. I've pasted the source for the
>> bulk of my test below. If someone wants the entire thing, let me know
>> and I'll provide everything.
>>
>> Travis
>>
RE: [PATCH] Use __rw_atomic_xxx() on Windows
Posted by Travis Vitek <tv...@quovadx.com>.
Since we don't have a string perf test that I could find, I wrote up a
quick and dirty one that just made many copies of the same string
repeatedly to exercise the atomic increment/decrement. The results show
a 3% performance penalty when using the newer atomic functions. This
test was run with an 8d configuration, so the atomic functions were
compiled into the stdcxx dll. The test hardware is a Lenovo T60p [Intel
Core 2 T7600 2.33GHz CPU, 2GB RAM].
Old new [patched]
------ 1 threads ------ 1 threads
ms 714 ms 737
ms/op 0.00004256 ms/op 0.00004393
------ 2 threads ------ 2 threads
ms 3911 ms 4024
ms/op 0.00023311 ms/op 0.00023985
------ 4 threads ------ 4 threads
ms 7660 ms 7865
ms/op 0.00045657 ms/op 0.00046879
------ 8 threads ------ 8 threads
ms 15192 ms 15585
ms/op 0.00090551 ms/op 0.00092894
I'm wondering if we used inline assembly for the __rw_atomic_* functions
if the cost would be reduced. We could also evaluate the intrinsic
pragma that is available on MSVC.
Travis
>-----Original Message-----
>
>I will do a quick run using the string performance test after lunch.
>I'll report the results on that later. I've pasted the source for the
>bulk of my test below. If someone wants the entire thing, let me know
>and I'll provide everything.
>
>Travis
>
RE: [PATCH] Use __rw_atomic_xxx() on Windows
Posted by Travis Vitek <tv...@quovadx.com>.
Oh, yeah. that is the other thing that I did Friday. I wrote a testcase
to compare __rw_atomic_add32() against InterlockedIncrement() on Win32.
There is a performance penalty...
C:\Temp>t 2 && t 4 && t 8
---------- locked inc ---- atomic_add ---- 2 threads
ms 4266 4469
ms/op 0.00003178 0.00003330 -4.7586%
thr ms 18117 18437
thr ms/op 0.00013498 0.00013737 -1.7663%
---------- locked inc ---- atomic_add ---- 4 threads
ms 7969 8609
ms/op 0.00005937 0.00006414 -8.0311%
thr ms 36359 37019
thr ms/op 0.00027090 0.00027581 -1.8152%
---------- locked inc ---- atomic_add ---- 8 threads
ms 5016 5484
ms/op 0.00003737 0.00004086 -9.3301%
thr ms 60846 66130
thr ms/op 0.00045334 0.00049271 -8.6842%
C:\Temp>t 2 && t 4 && t 8
---------- locked inc ---- atomic_add ---- 2 threads
ms 2781 2906
ms/op 0.00002072 0.00002165 -4.4948%
thr ms 14961 16093
thr ms/op 0.00011147 0.00011990 -7.5663%
---------- locked inc ---- atomic_add ---- 4 threads
ms 2781 2891
ms/op 0.00002072 0.00002154 -3.9554%
thr ms 30867 31328
thr ms/op 0.00022998 0.00023341 -1.4935%
---------- locked inc ---- atomic_add ---- 8 threads
ms 2782 2890
ms/op 0.00002073 0.00002153 -3.8821%
thr ms 64318 64341
thr ms/op 0.00047921 0.00047938 -0.0358%
I will do a quick run using the string performance test after lunch.
I'll report the results on that later. I've pasted the source for the
bulk of my test below. If someone wants the entire thing, let me know
and I'll provide everything.
Travis
Martin Sebor wrote:
>Subject: Re: [PATCH] Use __rw_atomic_xxx() on Windows
>
>What's the status of this? We need to decide if we can put this
>in 4.2 or defer it for 4.2.1. To put it in 4.2 we need to make
>sure the new functions don't cause a performance regression in
>basic_string. I.e., we need to see the before and after numbers.
>
>Martin
>
>Martin Sebor wrote:
>>
>> One concern I have is performance. Does replacing the intrinsics with
>> out of line function call whose semantics the compiler has no idea
>> about have any impact on the runtime efficiency of the
>generated code?
>> I would be especially interested in "real life" scenarios such as the
>> usage of the atomic operations in basic_string.
>>
>> It would be good to see some before and after numbers. If you don't
>> have all the platforms to run the test post your benchmark and Travis
>> can help you put them together.
>
#include <stdio.h>
#include <stdlib.h>
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#include <process.h>
#include "lib.h"
#define MIN_THREADS 2
#define MAX_THREADS 16
/* Time `iters` atomic increments of *val performed with the Win32
   InterlockedIncrement() intrinsic; returns the elapsed wall-clock
   time in milliseconds as reported by GetTickCount(). */
unsigned long locked_inc(long* val, long iters)
{
    const unsigned long start = GetTickCount ();

    for (long i = 0; i != iters; ++i)
        InterlockedIncrement (val);

    return GetTickCount () - start;
}
/* Time `iters` atomic increments of *val performed with the stdcxx
   out-of-line __rw_atomic_add32() function; returns the elapsed
   wall-clock time in milliseconds as reported by GetTickCount(). */
unsigned long atomic_add(long* val, long iters)
{
    const unsigned long start = GetTickCount ();

    for (long i = 0; i != iters; ++i)
        __rw_atomic_add32 (val, 1);

    return GetTickCount () - start;
}
// Per-thread benchmark descriptor: all fields except `result` are
// filled in by run_threads() before the thread starts; `result` is
// written by the thread itself in thread_func().
struct thread_param {
// counter shared by all threads (points at run_threads' local)
long* variable;
// number of iterations the thread performs
long iters;
// timed benchmark function to invoke (locked_inc or atomic_add)
unsigned long (*fun)(long*, long);
// elapsed milliseconds reported back by `fun`
unsigned long result;
// thread handle used by main thread to join via WaitForSingleObject()
HANDLE thread;
};
extern "C" {
void thread_func(void* p)
{
thread_param* param = (thread_param*)p;
param->result = (param->fun)(param->variable, param->iters);
}
} // extern "C"
/* Spawn `nthreads` threads, each invoking `fun` on a single shared
   counter for `iters` iterations, and return the sum of the
   per-thread elapsed times in milliseconds. */
unsigned long run_threads(int nthreads, unsigned long (*fun)(long*,
long), long iters)
{
    thread_param params[MAX_THREADS];
    long thread_var = 0;   // counter shared by all threads

    for (int i = 0; i < nthreads; ++i) {
        params[i].variable = &thread_var;
        params[i].result   = 0;
        params[i].fun      = fun;
        params[i].iters    = iters;
    }

    for (int n = 0; n < nthreads; ++n) {
        // fixed: "&params[n]" was garbled to "¶ms[n]" in the
        // archived source ("&para" rendered as the pilcrow entity)
        params[n].thread = (HANDLE)_beginthread (thread_func, 0,
                                                 &params[n]);
    }

    // NOTE(review): _beginthread() closes the thread handle when the
    // thread exits, so WaitForSingleObject() on it is potentially
    // racy; _beginthreadex() would be the safe choice -- confirm
    // before reusing this harness.
    unsigned long thread_time = 0;

    for (int n = 0; n < nthreads; ++n) {
        WaitForSingleObject (params[n].thread, INFINITE);
        thread_time += params[n].result;
    }

    return thread_time;
}
/* Benchmark driver: runs each atomic primitive single-threaded, then
   with argv[1] concurrent threads (clamped to [MIN_THREADS,
   MAX_THREADS]), printing elapsed ms, ms/op and the relative
   slowdown of atomic_add vs. locked_inc. */
int main(int argc, char* argv[])
{
    int nthreads = MIN_THREADS;
    if (1 < argc)
        nthreads = atoi (argv[1]);

    // clamp thread count to the supported range
    if (nthreads < MIN_THREADS)
        nthreads = MIN_THREADS;
    else if (MAX_THREADS < nthreads)
        nthreads = MAX_THREADS;

    const long ops = 0x7ffffff;   // iterations per measurement
    long thread_var;

    // single-threaded baseline for each primitive
    thread_var = 0;
    unsigned long locked_inc_ms = locked_inc (&thread_var, ops);

    thread_var = 0;
    unsigned long atomic_add_ms = atomic_add (&thread_var, ops);

    printf("---------- locked inc ---- atomic_add ---- %d threads\n",
    nthreads);
    // fixed: arguments are unsigned long, so the conversion must be
    // %lu ("%8.u" both mismatched the type -- undefined behavior on
    // LP64 -- and used a meaningless empty precision)
    printf("ms %8lu %8lu\n", locked_inc_ms,
    atomic_add_ms);

    float locked_inc_ops_p_ms = 1.f * locked_inc_ms / ops;
    float atomic_add_ops_p_ms = 1.f * atomic_add_ms / ops;

    printf("ms/op %8.8f %8.8f %.4f%%\n",
    locked_inc_ops_p_ms, atomic_add_ops_p_ms,
    100.f * (locked_inc_ops_p_ms - atomic_add_ops_p_ms) /
    locked_inc_ops_p_ms);

    // repeat with nthreads concurrent threads; run_threads() returns
    // the summed per-thread times, so divide to get the average
    locked_inc_ms = run_threads(nthreads, locked_inc, ops);
    atomic_add_ms = run_threads(nthreads, atomic_add, ops);

    locked_inc_ms /= nthreads;
    atomic_add_ms /= nthreads;

    // fixed: %8lu for unsigned long, as above
    printf("thr ms %8lu %8lu\n", locked_inc_ms,
    atomic_add_ms);

    locked_inc_ops_p_ms = 1.f * locked_inc_ms / ops;
    atomic_add_ops_p_ms = 1.f * atomic_add_ms / ops;

    printf("thr ms/op %8.8f %8.8f %.4f%%\n",
    locked_inc_ops_p_ms, atomic_add_ops_p_ms,
    100.f * (locked_inc_ops_p_ms - atomic_add_ops_p_ms) /
    locked_inc_ops_p_ms);

    return 0;
}