You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@subversion.apache.org by David Glasser <gl...@davidglasser.net> on 2009/10/13 19:45:34 UTC

Re: svn commit: r39997 - trunk/tools/server-side

Is it legit to hash on a struct? Couldn't there be padding bytes?

--dave

On Oct 13, 2009 12:27 PM, "Daniel Shahaf" <ti...@danielsh.fastmail.net>
wrote:

Author: danielsh
Date: Tue Oct 13 12:19:00 2009
New Revision: 39997

Log:
In svn-rep-sharing-stats, count the number of shares actually happening
rather than the total potential number of shares.  (This matters when
rep-sharing wasn't enabled during the entire history of the filesystem.)

As a side effect, refactor the use of hashes to use proper keys, values,
allocations, and counters; counters overloaded into pointers-to-char are
gone

* tools/server-side/svn-rep-sharing-stats.c
 (INITIAL_VALUE):  Remove.
 (struct key_t, struct value_t):  New.
 (record):  Use key_t and value_t instead of checksum cstring and
   unsigned int cast to char *.
 (process_one_revision, pretty_print):  Update docstrings' description of
   the hashes.
 (pretty_print):  Fetch keys/values from the hash as key_t and value_t.

Modified:
  trunk/tools/server-side/svn-rep-sharing-stats.c

Modified: trunk/tools/server-side/svn-rep-sharing-stats.c
URL:
http://svn.collab.net/viewvc/svn/trunk/tools/server-side/svn-rep-sharing-stats.c?pathrev=39997&r1=39996&r2=39997
==============================================================================
--- trunk/tools/server-side/svn-rep-sharing-stats.c     Tue Oct 13 10:49:57
2009        (r39996)
+++ trunk/tools/server-side/svn-rep-sharing-stats.c     Tue Oct 13 12:19:00
2009        (r39997)
@@ -160,13 +160,6 @@ enum {
  OPT_BOTH
 };

-/* In the hashes, the reference count is stored inside the value-pointer.
- * The value (unsigned int)u is stored as (char *)(INITIAL_VALUE + u),
- * where INITIAL_VALUE is the value returned by apr_hash_get() on an empty
- * hash:
- */
-#define INITIAL_VALUE NULL
-
 static svn_error_t *check_experimental(void)
 {
  if (getenv("SVN_REP_SHARING_STATS_IS_EXPERIMENTAL"))
@@ -177,14 +170,29 @@ static svn_error_t *check_experimental(v
                          "be used on live data.");
 }

-/* Increment records[rep->sha1_checksum] if all of them are non-NULL.
- * If necessary, allocate the cstring key in RESULT_POOL.  */
+/* The parts of a rep that determine whether it's being shared. */
+struct key_t
+{
+  svn_revnum_t revision;
+  apr_off_t offset;
+};
+
+/* What we need to know about a rep. */
+struct value_t
+{
+  svn_checksum_t *sha1_checksum;
+  apr_uint64_t refcount;
+};
+
+/* Increment records[rep] if both are non-NULL and REP contains a sha1.
+ * Allocate keys and values in RESULT_POOL.
+ */
 static svn_error_t *record(apr_hash_t *records,
                           representation_t *rep,
                           apr_pool_t *result_pool)
 {
-  const char *cstring;
-  char *oldvalue, *newvalue;
+  struct key_t *key;
+  struct value_t *value;

  /* Skip if we ignore this particular kind of reps, or if the rep doesn't
   * exist or doesn't have the checksum we are after.  (The latter case
@@ -193,20 +201,31 @@ static svn_error_t *record(apr_hash_t *r
  if (records == NULL || rep == NULL || rep->sha1_checksum == NULL)
    return SVN_NO_ERROR;

-  cstring = svn_checksum_to_cstring_display(rep->sha1_checksum,
result_pool);
-  /* TODO: hash should be keyed not on checksum alone; reps with same key
-   *       are not necessarily shared */
-  oldvalue = apr_hash_get(records, cstring, APR_HASH_KEY_STRING);
-  newvalue = oldvalue + 1;
-  apr_hash_set(records, cstring, APR_HASH_KEY_STRING, newvalue);
+  /* Construct the key. */
+  key = apr_palloc(result_pool, sizeof(*key));
+  key->revision = rep->revision;
+  key->offset = rep->offset;
+
+  /* Update or create the value. */
+  if (value = apr_hash_get(records, key, sizeof(*key)))
+    {
+      value->refcount++;
+    }
+  else
+    {
+      value = apr_palloc(result_pool, sizeof(*value));
+      value->sha1_checksum = svn_checksum_dup(rep->sha1_checksum,
result_pool);
+      value->refcount = 1;
+    }
+
+  /* Store them. */
+  apr_hash_set(records, key, sizeof(*key), value);

  return SVN_NO_ERROR;
 }

 /* Inspect the data and/or prop reps of revision REVNUM in FS.  Store
- * reference count tallies in passed hashes (allocated in RESULT_POOL),
- * as maps of const char * checksum cstrings to unsigned ints, represented
- * as in the documentation of INITIAL_VALUE.
+ * reference count tallies in passed hashes (allocated in RESULT_POOL).
 *
 * If PROP_REPS or DATA_REPS is NULL, the respective kind of reps are not
 * tallied.
@@ -276,8 +295,9 @@ process_one_revision(svn_fs_t *fs,
 }

 /* Print REPS_REF_COUNT (a hash as for process_one_revision())
- * to stdout in "value => key" format (for sorting, since the keys
- * are just sha1's).  Prepend each line by NAME.
+ * to stdout in "refcount => sha1" format.  A sha1 may appear
+ * more than once if not all its instances are shared.  Prepend
+ * each line by NAME.
 *
 * Use SCRATCH_POOL for temporary allocations.
 */
@@ -294,12 +314,18 @@ pretty_print(const char *name,
  for (hi = apr_hash_first(scratch_pool, reps_ref_counts);
       hi; hi = apr_hash_next(hi))
    {
-      const char *sha1_cstring = svn_apr_hash_index_key(hi);
-      unsigned int ref_count = (char *)svn_apr_hash_index_val(hi) -
INITIAL_VALUE;
+      const struct key_t *key;
+      struct value_t *value;

      SVN_ERR(cancel_func(NULL));
-      SVN_ERR(svn_cmdline_printf(scratch_pool, "%s %u %s\n",
-                                 name, ref_count, sha1_cstring));
+
+      key = svn_apr_hash_index_key(hi);
+      value = svn_apr_hash_index_val(hi);
+      SVN_ERR(svn_cmdline_printf(scratch_pool, "%s %" APR_UINT64_T_FMT "
%s\n",
+                                 name, value->refcount,
+                                 svn_checksum_to_cstring_display(
+                                   value->sha1_checksum,
+                                   scratch_pool)));
    }

  return SVN_NO_ERROR;

------------------------------------------------------
http://subversion.tigris.org/ds/viewMessage.do?dsForumId=495&dsMessageId=2407287

------------------------------------------------------
http://subversion.tigris.org/ds/viewMessage.do?dsForumId=462&dsMessageId=2407296

Re: svn commit: r39997 - trunk/tools/server-side

Posted by Daniel Shahaf <d....@daniel.shahaf.name>.
David Glasser wrote on Tue, 13 Oct 2009 at 17:15 -0700:
> While you're at it, maybe you want to check that the shas match when
> you're incrementing the reccount?

r40007.

------------------------------------------------------
http://subversion.tigris.org/ds/viewMessage.do?dsForumId=462&dsMessageId=2407445

Re: svn commit: r39997 - trunk/tools/server-side

Posted by David Glasser <gl...@davidglasser.net>.
On Tue, Oct 13, 2009 at 1:07 PM, Daniel Shahaf <d....@daniel.shahaf.name> wrote:
> Daniel Shahaf wrote on Tue, 13 Oct 2009 at 22:03 +0200:
>> David Glasser wrote on Tue, 13 Oct 2009 at 12:45 -0700:
>> > On Oct 13, 2009 12:27 PM, "Daniel Shahaf" <ti...@danielsh.fastmail.net>
>> > wrote:
>> >
>> > > +struct key_t
>> > > +{
>> > > +  svn_revnum_t revision;
>> > > +  apr_off_t offset;
>> > > +};
>> > > +
>> > > +/* What we need to know about a rep. */
>> > > +struct value_t
>> > > +{
>> > > +  svn_checksum_t *sha1_checksum;
>> > > +  apr_uint64_t refcount;
>> > > +};
>> ...
>> > > +  apr_hash_set(records, key, sizeof(*key), value);
>> >
>> > Is it legit to hash on a struct? Couldn't there be padding bytes?
>
> Should I use apr_pcalloc() to allocate the key_t structs?

Hmm, that would probably be fine.

While you're at it, maybe you want to check that the shas match when
you're incrementing the reccount?

--dave

-- 
glasser@davidglasser.net | langtonlabs.org | flickr.com/photos/glasser/

------------------------------------------------------
http://subversion.tigris.org/ds/viewMessage.do?dsForumId=462&dsMessageId=2407366

Re: svn commit: r39997 - trunk/tools/server-side

Posted by Daniel Shahaf <d....@daniel.shahaf.name>.
Daniel Shahaf wrote on Tue, 13 Oct 2009 at 22:03 +0200:
> David Glasser wrote on Tue, 13 Oct 2009 at 12:45 -0700:
> > On Oct 13, 2009 12:27 PM, "Daniel Shahaf" <ti...@danielsh.fastmail.net>
> > wrote:
> > 
> > > +struct key_t
> > > +{
> > > +  svn_revnum_t revision;
> > > +  apr_off_t offset;
> > > +};
> > > +
> > > +/* What we need to know about a rep. */
> > > +struct value_t
> > > +{
> > > +  svn_checksum_t *sha1_checksum;
> > > +  apr_uint64_t refcount;
> > > +};
> ...
> > > +  apr_hash_set(records, key, sizeof(*key), value);
> > 
> > Is it legit to hash on a struct? Couldn't there be padding bytes?

Should I use apr_pcalloc() to allocate the key_t structs?

------------------------------------------------------
http://subversion.tigris.org/ds/viewMessage.do?dsForumId=462&dsMessageId=2407310

Re: svn commit: r39997 - trunk/tools/server-side

Posted by Daniel Shahaf <d....@daniel.shahaf.name>.
Branko Cibej wrote on Wed, 14 Oct 2009 at 06:41 +0200:
> Daniel Shahaf wrote:
> > David Glasser wrote on Tue, 13 Oct 2009 at 12:45 -0700:
> >> On Oct 13, 2009 12:27 PM, "Daniel Shahaf" <ti...@danielsh.fastmail.net> wrote:
> >>> +  apr_hash_set(records, key, sizeof(*key), value);
> >>>       
> >> Is it legit to hash on a struct? Couldn't there be padding bytes?
> >>     
> >
> > While this doesn't answer the question, we have precedent in 
> > libsvn_fs_base/bdb/env.c:
> 
> ... but notice that bdb_env_key_t is part of bdb_env_t which has a
> constructor called create_env() that uses calloc() to allocate the
> struct. Yes, there can be padding, and yes, using (apr_p)calloc to
> allocate the struct avoids hashing random padding bits.
> 

Good to know.  Fixed in r40006, thanks.

------------------------------------------------------
http://subversion.tigris.org/ds/viewMessage.do?dsForumId=462&dsMessageId=2407439

Re: svn commit: r39997 - trunk/tools/server-side

Posted by Branko Cibej <br...@xbc.nu>.
Branko Čibej wrote:
> Philip Martin wrote:
>   
>> Branko Cibej <br...@xbc.nu> writes:
>>
>>   
>>     
>>> struct. Yes, there can be padding, and yes, using (apr_p)calloc to
>>> allocate the struct avoids hashing random padding bits.
>>>     
>>>       
>> Is that correct?  When code writes to a struct can it change the
>> padding bits?  I'm not aware of any requirement on the compiler to
>> avoid changing those bits.  Consider a machine where writing 32-bit
>> values is faster than writing smaller values: a compiler may choose to
>> use 32-bit writes to change a 16-bit value when the other 16-bits are
>> padding; the padding bits would then get random values.
>>   
>>     
> There is a requirement that any optimization have no side-effects that a
> program can detect. Widening a write the way you propose would certainly
> be detected with memcmp(); so I believe that, while a compiler is free
> to generate wider reads, or use wider arithmetic where overflow is not
> affected, etc., it's not allowed to generate wider writes that would
> affect padding.
>
> At least, that's my understanding.
>   

By the way, I believe the compiler *is* allowed to widen write to
stack-based variables, because any kind of fiddling with reading stack
padding is not well-defined; even its existence can't be determined in a
conformant way, since results of arithmetic on unrelated pointers are
undefined. This is quite unrelated to the fact that it does work, and
that valgrind et al. would scream at you if you changed stack padding.

-- Brane

------------------------------------------------------
http://subversion.tigris.org/ds/viewMessage.do?dsForumId=462&dsMessageId=2407483

Re: svn commit: r39997 - trunk/tools/server-side

Posted by Philip Martin <ph...@codematters.co.uk>.
Branko Čibej <br...@xbc.nu> writes:

> Philip Martin wrote:
>> Branko Cibej <br...@xbc.nu> writes:
>>
>>   
>>> struct. Yes, there can be padding, and yes, using (apr_p)calloc to
>>> allocate the struct avoids hashing random padding bits.
>>>     
>>
>> Is that correct?  When code writes to a struct can it change the
>> padding bits?  I'm not aware of any requirement on the compiler to
>> avoid changing those bits.  Consider a machine where writing 32-bit
>> values is faster than writing smaller values: a compiler may choose to
>> use 32-bit writes to change a 16-bit value when the other 16-bits are
>> padding; the padding bits would then get random values.
>>   
> There is a requirement that any optimization have no side-effects that a
> program can detect. Widening a write the way you propose would certainly
> be detected with memcmp(); so I believe that, while a compiler is free
> to generate wider reads, or use wider arithmetic where overflow is not
> affected, etc., it's not allowed to generate wider writes that would
> affect padding.

A conforming program cannot rely on the value of the padding bytes so
I don't think the optimisation argument works.  C99 standard states:

  6.2.6.1/6

  When a value is stored in an object of structure or union type,
  including in a member object, the bytes of the object representation
  that correspond to any padding bytes take unspecified values.

I think that allows writes to the padding bytes irrespective of any
optimisations.

------------------------------------------------------
http://subversion.tigris.org/ds/viewMessage.do?dsForumId=462&dsMessageId=2407487

Re: svn commit: r39997 - trunk/tools/server-side

Posted by Branko Cibej <br...@xbc.nu>.
Philip Martin wrote:
> Branko Cibej <br...@xbc.nu> writes:
>
>   
>> struct. Yes, there can be padding, and yes, using (apr_p)calloc to
>> allocate the struct avoids hashing random padding bits.
>>     
>
> Is that correct?  When code writes to a struct can it change the
> padding bits?  I'm not aware of any requirement on the compiler to
> avoid changing those bits.  Consider a machine where writing 32-bit
> values is faster than writing smaller values: a compiler may choose to
> use 32-bit writes to change a 16-bit value when the other 16-bits are
> padding; the padding bits would then get random values.
>   
There is a requirement that any optimization have no side-effects that a
program can detect. Widening a write the way you propose would certainly
be detected with memcmp(); so I believe that, while a compiler is free
to generate wider reads, or use wider arithmetic where overflow is not
affected, etc., it's not allowed to generate wider writes that would
affect padding.

At least, that's my understanding.

-- Brane

------------------------------------------------------
http://subversion.tigris.org/ds/viewMessage.do?dsForumId=462&dsMessageId=2407470

Re: svn commit: r39997 - trunk/tools/server-side

Posted by Philip Martin <ph...@codematters.co.uk>.
Branko Cibej <br...@xbc.nu> writes:

> struct. Yes, there can be padding, and yes, using (apr_p)calloc to
> allocate the struct avoids hashing random padding bits.

Is that correct?  When code writes to a struct can it change the
padding bits?  I'm not aware of any requirement on the compiler to
avoid changing those bits.  Consider a machine where writing 32-bit
values is faster than writing smaller values: a compiler may choose to
use 32-bit writes to change a 16-bit value when the other 16-bits are
padding; the padding bits would then get random values.

------------------------------------------------------
http://subversion.tigris.org/ds/viewMessage.do?dsForumId=462&dsMessageId=2407466

Re: svn commit: r39997 - trunk/tools/server-side

Posted by Branko Cibej <br...@xbc.nu>.
Daniel Shahaf wrote:
> David Glasser wrote on Tue, 13 Oct 2009 at 12:45 -0700:
>   
>> On Oct 13, 2009 12:27 PM, "Daniel Shahaf" <ti...@danielsh.fastmail.net>
>> wrote:
>>
>>     
>>> +struct key_t
>>> +{
>>> +  svn_revnum_t revision;
>>> +  apr_off_t offset;
>>> +};
>>> +
>>> +/* What we need to know about a rep. */
>>> +struct value_t
>>> +{
>>> +  svn_checksum_t *sha1_checksum;
>>> +  apr_uint64_t refcount;
>>> +};
>>>       
> ...
>   
>>> +  apr_hash_set(records, key, sizeof(*key), value);
>>>       
>> Is it legit to hash on a struct? Couldn't there be padding bytes?
>>     
>
> While this doesn't answer the question, we have precedent in 
> libsvn_fs_base/bdb/env.c:
>
>     apr_hash_set(bdb_cache, &bdb->key, sizeof bdb->key, NULL);
>
> where bdb->key is the following struct:
>
>     typedef struct
>     {
>       apr_dev_t device;
>       apr_ino_t inode;
>     } bdb_env_key_t;
>   

... but notice that bdb_env_key_t is part of bdb_env_t which has a
constructor called create_env() that uses calloc() to allocate the
struct. Yes, there can be padding, and yes, using (apr_p)calloc to
allocate the struct avoids hashing random padding bits.

-- Brane

------------------------------------------------------
http://subversion.tigris.org/ds/viewMessage.do?dsForumId=462&dsMessageId=2407402

Re: svn commit: r39997 - trunk/tools/server-side

Posted by Daniel Shahaf <d....@daniel.shahaf.name>.
David Glasser wrote on Tue, 13 Oct 2009 at 12:45 -0700:
> On Oct 13, 2009 12:27 PM, "Daniel Shahaf" <ti...@danielsh.fastmail.net>
> wrote:
> 
> > +struct key_t
> > +{
> > +  svn_revnum_t revision;
> > +  apr_off_t offset;
> > +};
> > +
> > +/* What we need to know about a rep. */
> > +struct value_t
> > +{
> > +  svn_checksum_t *sha1_checksum;
> > +  apr_uint64_t refcount;
> > +};
...
> > +  apr_hash_set(records, key, sizeof(*key), value);
> 
> Is it legit to hash on a struct? Couldn't there be padding bytes?

While this doesn't answer the question, we have precedent in 
libsvn_fs_base/bdb/env.c:

    apr_hash_set(bdb_cache, &bdb->key, sizeof bdb->key, NULL);

where bdb->key is the following struct:

    typedef struct
    {
      apr_dev_t device;
      apr_ino_t inode;
    } bdb_env_key_t;

------------------------------------------------------
http://subversion.tigris.org/ds/viewMessage.do?dsForumId=462&dsMessageId=2407307