GNU bug report logs - #13127
[PATCH] cut: use only one data strucutre

Date: Sun, 9 Dec 2012 10:29:01 UTC

Severity: normal

Tags: patch

Done: Pádraig Brady <P <at> draigBrady.com>

Bug is archived. No further changes may be made.

To add a comment to this bug, you must first unarchive it, by sending
a message to control AT debbugs.gnu.org, with unarchive 13127 in the body.
You can then email your comments to 13127 AT debbugs.gnu.org in the normal way.

Toggle the display of automated, internal messages from the tracker.

View this report as an mbox folder, status mbox, maintainer mbox

Report forwarded to bug-coreutils <at> gnu.org:
bug#13127; Package coreutils. (Sun, 09 Dec 2012 10:29:01 GMT) Full text and rfc822 format available.

Acknowledgement sent to xojoc <at> gmx.com:
New bug report received and forwarded. Copy sent to bug-coreutils <at> gnu.org. (Sun, 09 Dec 2012 10:29:01 GMT) Full text and rfc822 format available.

Message #5 received at submit <at> debbugs.gnu.org (full text, mbox):

From: Cojocaru Alexandru <xojoc <at> gmx.com>
To: bug-coreutils <at> gnu.org
Subject: [PATCH] cut: use only one data strucutre
Date: Sun, 9 Dec 2012 11:28:05 +0100

[Message part 1 (text/plain, inline)]

From 678c2ecfebbf7a278c14b7e6fcb815e87569cd20 Mon Sep 17 00:00:00 2001
From: Cojocaru Alexandru <xojoc <at> gmx.com>
Date: Sun, 9 Dec 2012 10:43:10 +0100
Subject: [PATCH] cut: use only one data structure

The current implementation of cut, uses a bit array,
an array of `struct range_pair's, and (when --output-delimiter
is specified) a hash_table. The new implementation will use
only an array of `struct range_pair's.
The old implementation is inefficient for the following reasons:
 1. When -b with a big num is specified, it allocates a lot of useless
    memory for `printable_field'.
 2. When --output-delimiter is specified, it will allocate 31 buckets.
    Even if a few ranges are specified.

* src/cut.c (set_fields): Set and initialize RP
instead of printable_field.
* src/cut.c (print_kth): Split it. Check *only* if a given field
or byte is printable.
* src/cut.c (is_range_start): New function.
* tests/misc/cut.pl: Check if `eol_range_start' is set correctly.
---
 src/cut.c         | 312 +++++++++++++++++++-----------------------------------
 tests/misc/cut.pl |   4 +
 2 files changed, 113 insertions(+), 203 deletions(-)

diff --git a/src/cut.c b/src/cut.c
index de9320c..545639d 100644
--- a/src/cut.c
+++ b/src/cut.c
@@ -53,8 +53,31 @@
     }									\
   while (0)
 
+
+struct range_pair
+  {
+    size_t lo;
+    size_t hi;
+  };
+
+/* Array of `struct range_pair' holding all the finite ranges. */
+static struct range_pair *rp;
+
+/* Pointer inside RP. When checking if a byte or field is selected
+   by a finite range, we check if it is between CURRENT_RP.LO
+   and CURRENT_RP.HI. If the byte or field index is greater than
+   CURRENT_RP.HI then we make CURRENT_RP to point to the next range pair. */
+static struct range_pair *current_rp;
+
+/* Number of finite ranges specified by the user. */
+static size_t n_rp;
+
+/* Number of `struct range_pair's allocated. */
+static size_t n_rp_allocated;
+
+
 /* Append LOW, HIGH to the list RP of range pairs, allocating additional
-   space if necessary.  Update local variable N_RP.  When allocating,
+   space if necessary.  Update global variable N_RP.  When allocating,
    update global variable N_RP_ALLOCATED.  */
 
 #define ADD_RANGE_PAIR(rp, low, high)			\
@@ -72,11 +95,6 @@
     }							\
   while (0)
 
-struct range_pair
-  {
-    size_t lo;
-    size_t hi;
-  };
 
 /* This buffer is used to support the semantics of the -s option
    (or lack of same) when the specified field list includes (does
@@ -90,26 +108,11 @@ static char *field_1_buffer;
 /* The number of bytes allocated for FIELD_1_BUFFER.  */
 static size_t field_1_bufsize;
 
-/* The largest field or byte index used as an endpoint of a closed
-   or degenerate range specification;  this doesn't include the starting
-   index of right-open-ended ranges.  For example, with either range spec
-   '2-5,9-', '2-3,5,9-' this variable would be set to 5.  */
-static size_t max_range_endpoint;
 
 /* If nonzero, this is the index of the first field in a range that goes
    to end of line. */
 static size_t eol_range_start;
 
-/* This is a bit vector.
-   In byte mode, which bytes to output.
-   In field mode, which DELIM-separated fields to output.
-   Both bytes and fields are numbered starting with 1,
-   so the zeroth bit of this array is unused.
-   A field or byte K has been selected if
-   (K <= MAX_RANGE_ENDPOINT and is_printable_field(K))
-    || (EOL_RANGE_START > 0 && K >= EOL_RANGE_START).  */
-static unsigned char *printable_field;
-
 enum operating_mode
   {
     undefined_mode,
@@ -148,15 +151,6 @@ static char *output_delimiter_string;
 /* True if we have ever read standard input. */
 static bool have_read_stdin;
 
-#define HT_RANGE_START_INDEX_INITIAL_CAPACITY 31
-
-/* The set of range-start indices.  For example, given a range-spec list like
-   '-b1,3-5,4-9,15-', the following indices will be recorded here: 1, 3, 15.
-   Note that although '4' looks like a range-start index, it is in the middle
-   of the '3-5' range, so it doesn't count.
-   This table is created/used IFF output_delimiter_specified is set.  */
-static Hash_table *range_start_ht;
-
 /* For long options that have no equivalent short option, use a
    non-character as a pseudo short option, starting with CHAR_MAX + 1.  */
 enum
@@ -240,73 +234,33 @@ With no FILE, or when FILE is -, read standard input.\n\
   exit (status);
 }
 
-static inline void
-mark_range_start (size_t i)
-{
-  /* Record the fact that 'i' is a range-start index.  */
-  void *ent_from_table = hash_insert (range_start_ht, (void*) i);
-  if (ent_from_table == NULL)
-    {
-      /* Insertion failed due to lack of memory.  */
-      xalloc_die ();
-    }
-  assert ((size_t) ent_from_table == i);
-}
-
-static inline void
-mark_printable_field (size_t i)
-{
-  size_t n = i / CHAR_BIT;
-  printable_field[n] |= (1 << (i % CHAR_BIT));
-}
-
-static inline bool
-is_printable_field (size_t i)
-{
-  size_t n = i / CHAR_BIT;
-  return (printable_field[n] >> (i % CHAR_BIT)) & 1;
-}
-
-static size_t
-hash_int (const void *x, size_t tablesize)
-{
-#ifdef UINTPTR_MAX
-  uintptr_t y = (uintptr_t) x;
-#else
-  size_t y = (size_t) x;
-#endif
-  return y % tablesize;
-}
+/* Return nonzero if the K'th field or byte is printable. */
 
 static bool
-hash_compare_ints (void const *x, void const *y)
+print_kth (size_t k)
 {
-  return (x == y) ? true : false;
-}
+  bool k_selected = false;
+  if (0 < eol_range_start && eol_range_start <= k)
+    k_selected = true;
+  else if (current_rp->lo <= k && k <= current_rp->hi)
+    k_selected = true;
 
-static bool
-is_range_start_index (size_t i)
-{
-  return hash_lookup (range_start_ht, (void *) i) ? true : false;
+  return k_selected ^ complement;
 }
 
-/* Return nonzero if the K'th field or byte is printable.
-   When returning nonzero, if RANGE_START is non-NULL,
-   set *RANGE_START to true if K is the beginning of a range, and to
-   false otherwise.  */
+/* Return nonzero if K'th byte is the beginning of a range. */
 
-static bool
-print_kth (size_t k, bool *range_start)
+static inline bool
+is_range_start (size_t k)
 {
-  bool k_selected
-    = ((0 < eol_range_start && eol_range_start <= k)
-       || (k <= max_range_endpoint && is_printable_field (k)));
+  bool is_start = false;
 
-  bool is_selected = k_selected ^ complement;
-  if (range_start && is_selected)
-    *range_start = is_range_start_index (k);
+  if (!complement)
+    is_start = (k == eol_range_start || k == current_rp->lo);
+  else
+    is_start = (k == (current_rp - 1)->hi + 1);
 
-  return is_selected;
+  return is_start;
 }
 
 /* Comparison function for qsort to order the list of
@@ -319,24 +273,14 @@ compare_ranges (const void *a, const void *b)
   return a_start < b_start ? -1 : a_start > b_start;
 }
 
-/* Given the list of field or byte range specifications FIELDSTR, set
-   MAX_RANGE_ENDPOINT and allocate and initialize the PRINTABLE_FIELD
-   array.  If there is a right-open-ended range, set EOL_RANGE_START
-   to its starting index.  FIELDSTR should be composed of one or more
-   numbers or ranges of numbers, separated by blanks or commas.
-   Incomplete ranges may be given: '-m' means '1-m'; 'n-' means 'n'
-   through end of line.  Return true if FIELDSTR contains at least
-   one field specification, false otherwise.  */
-
-/* FIXME-someday:  What if the user wants to cut out the 1,000,000-th
-   field of some huge input file?  This function shouldn't have to
-   allocate a table of a million bits just so we can test every
-   field < 10^6 with an array dereference.  Instead, consider using
-   an adaptive approach: if the range of selected fields is too large,
-   but only a few fields/byte-offsets are actually selected, use a
-   hash table.  If the range of selected fields is too large, and
-   too many are selected, then resort to using the range-pairs (the
-   'rp' array) directly.  */
+/* Given the list of field or byte range specifications FIELDSTR,
+   allocate and initialize the RP array. If there is a right-open-ended
+   range, set EOL_RANGE_START to its starting index. FIELDSTR should
+   be composed of one or more numbers or ranges of numbers, separated
+   by blanks or commas. Incomplete ranges may be given: '-m' means '1-m';
+   'n-' means 'n' through end of line.
+   Return true if FIELDSTR contains at least one field specification,
+   false otherwise.  */
 
 static bool
 set_fields (const char *fieldstr)
@@ -349,9 +293,6 @@ set_fields (const char *fieldstr)
   bool field_found = false;	/* True if at least one field spec
                                    has been processed.  */
 
-  struct range_pair *rp = NULL;
-  size_t n_rp = 0;
-  size_t n_rp_allocated = 0;
   size_t i;
   bool in_digits = false;
 
@@ -403,41 +344,10 @@ set_fields (const char *fieldstr)
                   if (value < initial)
                     FATAL_ERROR (_("invalid decreasing range"));
 
-                  /* Is there already a range going to end of line? */
-                  if (eol_range_start != 0)
-                    {
-                      /* Yes.  Is the new sequence already contained
-                         in the old one?  If so, no processing is
-                         necessary. */
-                      if (initial < eol_range_start)
-                        {
-                          /* No, the new sequence starts before the
-                             old.  Does the old range going to end of line
-                             extend into the new range?  */
-                          if (eol_range_start <= value)
-                            {
-                              /* Yes.  Simply move the end of line marker. */
-                              eol_range_start = initial;
-                            }
-                          else
-                            {
-                              /* No.  A simple range, before and disjoint from
-                                 the range going to end of line.  Fill it. */
-                              ADD_RANGE_PAIR (rp, initial, value);
-                            }
-
-                          /* In any case, some fields were selected. */
-                          field_found = true;
-                        }
-                    }
-                  else
-                    {
-                      /* There is no range going to end of line. */
-                      ADD_RANGE_PAIR (rp, initial, value);
-                      field_found = true;
-                    }
-                  value = 0;
+                  ADD_RANGE_PAIR (rp, initial, value);
+                  field_found = true;
                 }
+              value = 0;
             }
           else
             {
@@ -448,9 +358,7 @@ set_fields (const char *fieldstr)
             }
 
           if (*fieldstr == '\0')
-            {
-              break;
-            }
+            break;
 
           fieldstr++;
           lhs_specified = false;
@@ -494,47 +402,43 @@ set_fields (const char *fieldstr)
         FATAL_ERROR (_("invalid byte, character or field list"));
     }
 
-  max_range_endpoint = 0;
-  for (i = 0; i < n_rp; i++)
+  qsort (rp, n_rp, sizeof (rp[0]), compare_ranges);
+
+  /* Omit finite ranges subsumed by a to-EOL range. */
+  if (eol_range_start && n_rp)
     {
-      if (rp[i].hi > max_range_endpoint)
-        max_range_endpoint = rp[i].hi;
+      i = n_rp;
+      while (i && eol_range_start <= rp[i - 1].hi)
+        {
+          eol_range_start = MIN (rp[i - 1].lo, eol_range_start);
+          --n_rp;
+          --i;
+        }
     }
 
-  /* Allocate an array large enough so that it may be indexed by
-     the field numbers corresponding to all finite ranges
-     (i.e. '2-6' or '-4', but not '5-') in FIELDSTR.  */
-
-  if (max_range_endpoint)
-    printable_field = xzalloc (max_range_endpoint / CHAR_BIT + 1);
-
-  qsort (rp, n_rp, sizeof (rp[0]), compare_ranges);
-
-  /* Set the array entries corresponding to integers in the ranges of RP.  */
-  for (i = 0; i < n_rp; i++)
+  /* Merge finite range pairs (e.g. `2-5,3-4' becomes `2-5'). */
+  for (i = 0; i < n_rp; ++i)
     {
-      /* Ignore any range that is subsumed by the to-EOL range.  */
-      if (eol_range_start && eol_range_start <= rp[i].lo)
-        continue;
-
-      /* Record the range-start indices, i.e., record each start
-         index that is not part of any other (lo..hi] range.  */
-      size_t rsi_candidate = complement ? rp[i].hi + 1 : rp[i].lo;
-      if (output_delimiter_specified
-          && !is_printable_field (rsi_candidate))
-        mark_range_start (rsi_candidate);
-
-      for (size_t j = rp[i].lo; j <= rp[i].hi; j++)
-        mark_printable_field (j);
+      for (size_t j = i + 1; j < n_rp; ++j)
+        {
+          if (rp[j].lo <= rp[i].hi)
+            {
+              rp[i].hi = MAX (rp[j].hi, rp[i].hi);
+              memmove (rp + j, rp + j + 1,
+                       (n_rp - j - 1) * sizeof (struct range_pair));
+              --n_rp;
+            }
+          else
+            break;
+        }
     }
 
-  if (output_delimiter_specified
-      && !complement
-      && eol_range_start
-      && max_range_endpoint && !is_printable_field (eol_range_start))
-    mark_range_start (eol_range_start);
 
-  free (rp);
+  /* After merging, reallocate RP so we realise memory to the system.
+     Also add a sentinel at the end of RP, so we never get memory segfault. */
+  ++n_rp;
+  rp = xrealloc (rp, n_rp * sizeof (struct range_pair));
+  rp[n_rp - 1].lo = rp[n_rp - 1].hi = 0;
 
   return field_found;
 }
@@ -551,7 +455,8 @@ cut_bytes (FILE *stream)
 
   byte_idx = 0;
   print_delimiter = false;
-  while (1)
+  current_rp = rp;
+  while (true)
     {
       int c;		/* Each character from the file. */
 
@@ -562,6 +467,7 @@ cut_bytes (FILE *stream)
           putchar ('\n');
           byte_idx = 0;
           print_delimiter = false;
+          current_rp = rp;
         }
       else if (c == EOF)
         {
@@ -571,16 +477,21 @@ cut_bytes (FILE *stream)
         }
       else
         {
-          bool range_start;
-          bool *rs = output_delimiter_specified ? &range_start : NULL;
-          if (print_kth (++byte_idx, rs))
+          ++byte_idx;
+          if ((current_rp->hi < byte_idx) && (current_rp < rp + n_rp - 1))
+            ++current_rp;
+          if (print_kth (byte_idx))
             {
-              if (rs && *rs && print_delimiter)
+              if (output_delimiter_specified)
                 {
-                  fwrite (output_delimiter_string, sizeof (char),
-                          output_delimiter_length, stdout);
+                  if (print_delimiter && is_range_start (byte_idx))
+                    {
+                      fwrite (output_delimiter_string, sizeof (char),
+                              output_delimiter_length, stdout);
+                    }
+                  print_delimiter = true;
                 }
-              print_delimiter = true;
+
               putchar (c);
             }
         }
@@ -597,6 +508,8 @@ cut_fields (FILE *stream)
   bool found_any_selected_field = false;
   bool buffer_first_field;
 
+  current_rp = rp;
+
   c = getc (stream);
   if (c == EOF)
     return;
@@ -609,7 +522,7 @@ cut_fields (FILE *stream)
      and the first field has been selected, or if non-delimited lines
      must be suppressed and the first field has *not* been selected.
      That is because a non-delimited line has exactly one field.  */
-  buffer_first_field = (suppress_non_delimited ^ !print_kth (1, NULL));
+  buffer_first_field = (suppress_non_delimited ^ !print_kth (1));
 
   while (1)
     {
@@ -650,7 +563,7 @@ cut_fields (FILE *stream)
                 }
               continue;
             }
-          if (print_kth (1, NULL))
+          if (print_kth (1))
             {
               /* Print the field, but not the trailing delimiter.  */
               fwrite (field_1_buffer, sizeof (char), n_bytes - 1, stdout);
@@ -661,7 +574,7 @@ cut_fields (FILE *stream)
 
       if (c != EOF)
         {
-          if (print_kth (field_idx, NULL))
+          if (print_kth (field_idx))
             {
               if (found_any_selected_field)
                 {
@@ -695,7 +608,11 @@ cut_fields (FILE *stream)
         }
 
       if (c == delim)
-        ++field_idx;
+        {
+          ++field_idx;
+          if ((field_idx > current_rp->hi) && (current_rp < rp + n_rp - 1))
+            ++current_rp;
+        }
       else if (c == '\n' || c == EOF)
         {
           if (found_any_selected_field
@@ -704,6 +621,7 @@ cut_fields (FILE *stream)
           if (c == EOF)
             break;
           field_idx = 1;
+          current_rp = rp;
           found_any_selected_field = false;
         }
     }
@@ -854,16 +772,6 @@ main (int argc, char **argv)
     FATAL_ERROR (_("suppressing non-delimited lines makes sense\n\
 \tonly when operating on fields"));
 
-  if (output_delimiter_specified)
-    {
-      range_start_ht = hash_initialize (HT_RANGE_START_INDEX_INITIAL_CAPACITY,
-                                        NULL, hash_int,
-                                        hash_compare_ints, NULL);
-      if (range_start_ht == NULL)
-        xalloc_die ();
-
-    }
-
   if (! set_fields (spec_list_string))
     {
       if (operating_mode == field_mode)
@@ -890,8 +798,6 @@ main (int argc, char **argv)
     for (ok = true; optind < argc; optind++)
       ok &= cut_file (argv[optind]);
 
-  if (range_start_ht)
-    hash_free (range_start_ht);
 
   if (have_read_stdin && fclose (stdin) == EOF)
     {
diff --git a/tests/misc/cut.pl b/tests/misc/cut.pl
index 0f0a3a3..120880c 100755
--- a/tests/misc/cut.pl
+++ b/tests/misc/cut.pl
@@ -182,6 +182,10 @@ my @Tests =
                                          {IN=>"123456\n"}, {OUT=>"23456\n"}],
   ['EOL-subsumed-3', '--complement -b3,4-4,5,2-',
                                          {IN=>"123456\n"}, {OUT=>"1\n"}],
+
+  ['EOL-subsumed-4', '--output-d=: -b1-2,2-3,3-',
+                                        {IN=>"1234\n"}, {OUT=>"1234\n"}],
+
  );
 
 if ($mb_locale ne 'C')
-- 
1.8.0.1


Best regards,
Cojocaru Alexandru

[DIFF-new-impl (application/octet-stream, attachment)]

Information forwarded to bug-coreutils <at> gnu.org:
bug#13127; Package coreutils. (Sun, 09 Dec 2012 20:46:02 GMT) Full text and rfc822 format available.

Message #8 received at 13127 <at> debbugs.gnu.org (full text, mbox):

From: Jim Meyering <jim <at> meyering.net>
To: Cojocaru Alexandru <xojoc <at> gmx.com>
Cc: 13127 <at> debbugs.gnu.org
Subject: Re: bug#13127: [PATCH] cut: use only one data strucutre
Date: Sun, 09 Dec 2012 21:45:03 +0100

Cojocaru Alexandru wrote:
> Subject: [PATCH] cut: use only one data structure
>
> The current implementation of cut, uses a bit array,
> an array of `struct range_pair's, and (when --output-delimiter
> is specified) a hash_table. The new implementation will use
> only an array of `struct range_pair's.
> The old implementation is inefficient for the following reasons:
>  1. When -b with a big num is specified, it allocates a lot of useless
>     memory for `printable_field'.
>  2. When --output-delimiter is specified, it will allocate 31 buckets.
>     Even if a few ranges are specified.
...
> -/* Given the list of field or byte range specifications FIELDSTR, set
> -   MAX_RANGE_ENDPOINT and allocate and initialize the PRINTABLE_FIELD
> -   array.  If there is a right-open-ended range, set EOL_RANGE_START
> -   to its starting index.  FIELDSTR should be composed of one or more
> -   numbers or ranges of numbers, separated by blanks or commas.
> -   Incomplete ranges may be given: '-m' means '1-m'; 'n-' means 'n'
> -   through end of line.  Return true if FIELDSTR contains at least
> -   one field specification, false otherwise.  */
> -
> -/* FIXME-someday:  What if the user wants to cut out the 1,000,000-th
> -   field of some huge input file?  This function shouldn't have to
> -   allocate a table of a million bits just so we can test every
> -   field < 10^6 with an array dereference.  Instead, consider using
> -   an adaptive approach: if the range of selected fields is too large,
> -   but only a few fields/byte-offsets are actually selected, use a
> -   hash table.  If the range of selected fields is too large, and
> -   too many are selected, then resort to using the range-pairs (the
> -   'rp' array) directly.  */

Thanks for the patch.
This is large enough that you'll have to file a copyright assignment.
For details, see the "Copyright assignment" section in the file
named HACKING.

Have you considered performance in the common case?
I suspect that a byte or field number larger than 1000 is
not common.  That is why, in the FIXME comment above,
I suggested to use an adaptive approach.  I had the feeling
(don't remember if I profiled it) that testing a bit per
input field would be more efficient than an in-range test.

If you construct test cases and gather timing data, please do so
in a reproducible manner and include details when you report back,
so we can compare on different types of systems.  Cache matters a
lot, these days.

Information forwarded to bug-coreutils <at> gnu.org:
bug#13127; Package coreutils. (Tue, 11 Dec 2012 14:27:02 GMT) Full text and rfc822 format available.

Message #11 received at 13127 <at> debbugs.gnu.org (full text, mbox):

From: Cojocaru Alexandru <xojoc <at> gmx.com>
To: Jim Meyering <jim <at> meyering.net>
Cc: 13127 <at> debbugs.gnu.org
Subject: Re: bug#13127: [PATCH] cut: use only one data strucutre
Date: Tue, 11 Dec 2012 15:24:36 +0100

[Message part 1 (text/plain, inline)]

On Sun, 09 Dec 2012 21:45:03 +0100
Jim Meyering <jim <at> meyering.net> wrote:

> Thanks for the patch.
> This is large enough that you'll have to file a copyright assignment.
> For details, see the "Copyright assignment" section in the file
> named HACKING.

Fine.


> Have you considered performance in the common case?
> I suspect that a byte or field number larger than 1000 is
> not common.  That is why, in the FIXME comment above,
> I suggested to use an adaptive approach.  I had the feeling
> (don't remember if I profiled it) that testing a bit per
> input field would be more efficient than an in-range test.

Yes, it was the first thing I checked. And there's no performance loss.


> If you construct test cases and gather timing data, please do so
> in a reproducible manner and include details when you report back,
> so we can compare on different types of systems.

Here are my benchmarks:

OS:       Parabola GNU/linux-libre (linux-libre v3.6.8-1)
Compiler: GCC 4.7.2
Cflags:   -O2
LANG:     C
CPU:      Intel Core Duo  (1.86 GHz) (L1 Cache 64KiB) (L2 Cache 2MiB)
Main memory:
 - Bank 0: DIMM DRAM Synchronous (1GiB) (width 64 bits)
 - Bank 1: DIMM DRAM Synchronous (1GiB) (width 64 bits)

NOTE: information gathered with `lshw'.


Summary (see the attached file for complete data):

### small ranges
cut-pre: 0:01.84
cut-post: 0:01.36
cut-split: 0:01.25

### bigger ranges
cut-pre: 0:11.74
cut-post: 0:09.20
cut-split: 0:07.91 ***

### fields
cut-pre: 0:02.90
cut-post: 0:02.68
cut-split: 0:02.85

### --output-delimiter
cut-pre: 0:02.90
cut-post: 0:02.74
cut-split: 0:02.80


NOTES:
 cut-pre is the current implementation and was compiled from commit ec48beadf.
 cut-post was compiled after applying the above patch to commit ec48beadf.
 cut-split was compiled after applying the `split-print_kth' patch to commit ec48beadf.


The main advantages cames from splitting `print_kth' into two
separate functions, so now `print_kth' does fewer checks.


Best regards,
Cojocaru Alexandru

[full-data.txt (text/plain, attachment)]

[split-print_kth (application/octet-stream, attachment)]

Information forwarded to bug-coreutils <at> gnu.org:
bug#13127; Package coreutils. (Fri, 26 Apr 2013 16:08:02 GMT) Full text and rfc822 format available.

Message #14 received at 13127 <at> debbugs.gnu.org (full text, mbox):

From: Pádraig Brady <P <at> draigBrady.com>
To: xojoc <at> gmx.com
Cc: 13127 <at> debbugs.gnu.org
Subject: Re: bug#13127: [PATCH] cut: use only one data strucutre
Date: Fri, 26 Apr 2013 17:07:01 +0100

[Message part 1 (text/plain, inline)]

I've rebased this to master and attached.
The rebase wasn't trivial so I might have messed up.
The cut.pl test is now failing on master. Could you have a look.
Also could you add a test (or just a bit of shell) to demonstrate
which options the memory is not allocated for example.
Ideally some pathological option combo that no longer
allocates huge amounts of RAM.

thanks,
Pádraig.

[cut-mem.patch (text/x-patch, attachment)]

Information forwarded to bug-coreutils <at> gnu.org:
bug#13127; Package coreutils. (Fri, 26 Apr 2013 16:12:01 GMT) Full text and rfc822 format available.

Message #17 received at 13127 <at> debbugs.gnu.org (full text, mbox):

From: Pádraig Brady <P <at> draigBrady.com>
To: xojoc <at> gmx.com
Cc: 13127 <at> debbugs.gnu.org
Subject: Re: bug#13127: [PATCH] cut: use only one data strucutre
Date: Fri, 26 Apr 2013 17:11:49 +0100

[Message part 1 (text/plain, inline)]

This separate patch to simplify the print_kth() function
by removing the comparison from it, is simple and
has a significant perf advantage. Tests pass so I'll apply.

I'll adjust the commit log to summarise the perf change,
but I notice the change isn't as great as yours on my sandybridge i3 system.
Benchmark results for both the rebased memory rework and
the simple print_kth() optimization attached.

thanks!
Pádraig.

[split-pb-results (text/plain, attachment)]

Information forwarded to bug-coreutils <at> gnu.org:
bug#13127; Package coreutils. (Fri, 26 Apr 2013 18:53:02 GMT) Full text and rfc822 format available.

Message #20 received at 13127 <at> debbugs.gnu.org (full text, mbox):

From: "Alexandru Cojocaru" <xojoc <at> gmx.com>
To: "Pádraig Brady" <P <at> draigBrady.com>
Cc: 13127 <at> debbugs.gnu.org
Subject: Re: bug#13127: [PATCH] cut: use only one data strucutre
Date: Fri, 26 Apr 2013 20:52:35 +0200

Hi,
sorry for the delay.

From: Pádraig Brady
> Sent: 04/26/13 04:07 PM
> To: xojoc <at> gmx.com
> Subject: Re: bug#13127: [PATCH] cut: use only one data strucutre

> The rebase wasn't trivial so I might have messed up.
Hum, I had problems only with `cut.pl'.

> The cut.pl test is now failing on master. Could you have a look.
I had no problems. Could you show me your output?

> Also could you add a test (or just a bit of shell) to demonstrate
> which options the memory is not allocated for example.
> Ideally some pathological option combo that no longer
> allocates huge amounts of RAM.
$ echo a | cut -b1-$(echo '2^32-1'|bc)
cut: memory exhausted

Could you please write the test? It seems that I should use $limits
but I don't know how exactly :-). Thanks.

> thanks,
> Pádraig.

Best regards,
Cojocaru Alexandru

Information forwarded to bug-coreutils <at> gnu.org:
bug#13127; Package coreutils. (Fri, 26 Apr 2013 19:26:01 GMT) Full text and rfc822 format available.

Message #23 received at 13127 <at> debbugs.gnu.org (full text, mbox):

From: Pádraig Brady <P <at> draigBrady.com>
To: Alexandru Cojocaru <xojoc <at> gmx.com>
Cc: 13127 <at> debbugs.gnu.org
Subject: Re: bug#13127: [PATCH] cut: use only one data strucutre
Date: Fri, 26 Apr 2013 20:24:54 +0100

On 04/26/2013 07:52 PM, Alexandru Cojocaru wrote:
> Hi,
> sorry for the delay.
> 
> From: Pádraig Brady
>> Sent: 04/26/13 04:07 PM
>> To: xojoc <at> gmx.com
>> Subject: Re: bug#13127: [PATCH] cut: use only one data strucutre
> 
>> The rebase wasn't trivial so I might have messed up.
> Hum, I had problems only with `cut.pl'.

Did you pull the latest master?
The last patch I sent is against that.

>> The cut.pl test is now failing on master. Could you have a look.
> I had no problems. Could you show me your output?

Ah the failures are in tests I added in the meantime:
http://git.sv.gnu.org/gitweb/?p=coreutils.git;a=commit;h=51ce0bf8

Specifically this is now only outputting the first field,
rather than both fields like it should:

printf '%s\n' a:1 b:2 | src/cut -s -d: -f1,2

>> Also could you add a test (or just a bit of shell) to demonstrate
>> which options the memory is not allocated for example.
>> Ideally some pathological option combo that no longer
>> allocates huge amounts of RAM.
> $ echo a | cut -b1-$(echo '2^32-1'|bc)
> cut: memory exhausted

Ok cool, I was just ensuring I didn't miss anything.

> Could you please write the test? It seems that I should use $limits
> but I don't know how exactly :-). Thanks.

I'll write a test based on:
(ulimit -v 20000; : | cut -b1-$((2**32-1))) || echo fail

thanks,
Pádraig.

Information forwarded to bug-coreutils <at> gnu.org:
bug#13127; Package coreutils. (Fri, 26 Apr 2013 22:50:01 GMT) Full text and rfc822 format available.

Message #26 received at 13127 <at> debbugs.gnu.org (full text, mbox):

From: "Alexandru Cojocaru" <xojoc <at> gmx.com>
To: "Pádraig Brady" <P <at> draigBrady.com>
Cc: 13127 <at> debbugs.gnu.org
Subject: Re: bug#13127: [PATCH] cut: use only one data strucutre
Date: Sat, 27 Apr 2013 00:49:34 +0200

[Message part 1 (text/plain, inline)]

From: Pádraig Brady
> Sent: 04/26/13 07:24 PM
> To: Alexandru Cojocaru
> Subject: Re: bug#13127: [PATCH] cut: use only one data strucutre
> 
> On 04/26/2013 07:52 PM, Alexandru Cojocaru wrote:
> > Hi,
> > sorry for the delay.
> > 
> > From: Pádraig Brady
> >> Sent: 04/26/13 04:07 PM
> >> To: xojoc <at> gmx.com
> >> Subject: Re: bug#13127: [PATCH] cut: use only one data strucutre
> > 
> >> The rebase wasn't trivial so I might have messed up.
> > Hum, I had problems only with `cut.pl'.
> 
> Did you pull the latest master?
> The last patch I sent is against that.
Ah, yeah I used your patch. This is why it worked.

> >> The cut.pl test is now failing on master. Could you have a look.
> > I had no problems. Could you show me your output?
> 
> Ah the failures are in tests I added in the meantime:
> http://git.sv.gnu.org/gitweb/?p=coreutils.git;a=commit;h=51ce0bf8
> 
> Specifically this is now only outputting the first field,
> rather than both fields like it should:
> 
> printf '%s\n' a:1 b:2 | src/cut -s -d: -f1,2
The problem was caused by `current_rp' which wasn't
incremented as needed. See attachment for patch.
My tests were succesfull, can you recheck?

> >> Also could you add a test (or just a bit of shell) to demonstrate
> >> which options the memory is not allocated for example.
> >> Ideally some pathological option combo that no longer
> >> allocates huge amounts of RAM.
> > $ echo a | cut -b1-$(echo '2^32-1'|bc)
> > cut: memory exhausted
> 
> Ok cool, I was just ensuring I didn't miss anything.
> 
> > Could you please write the test? It seems that I should use $limits
> > but I don't know how exactly :-). Thanks.
> 
> I'll write a test based on:
> (ulimit -v 20000; : | cut -b1-$((2**32-1))) || echo fail
Thanks for the test case.

> thanks,
> Pádraig.

Best regards,
Cojocaru Alexandru

[cut-fix-bug.patch (application/octet-stream, attachment)]

Reply sent to Pádraig Brady <P <at> draigBrady.com>:
You have taken responsibility. (Fri, 26 Apr 2013 23:47:02 GMT) Full text and rfc822 format available.

Notification sent to xojoc <at> gmx.com:
bug acknowledged by developer. (Fri, 26 Apr 2013 23:47:02 GMT) Full text and rfc822 format available.

Message #31 received at 13127-done <at> debbugs.gnu.org (full text, mbox):

From: Pádraig Brady <P <at> draigBrady.com>
To: Alexandru Cojocaru <xojoc <at> gmx.com>
Cc: 13127-done <at> debbugs.gnu.org
Subject: Re: bug#13127: [PATCH] cut: use only one data strucutre
Date: Sat, 27 Apr 2013 00:46:43 +0100

On 04/26/2013 11:49 PM, Alexandru Cojocaru wrote:

>>>> The cut.pl test is now failing on master. Could you have a look.
>>> I had no problems. Could you show me your output?
>>
>> Ah the failures are in tests I added in the meantime:
>> http://git.sv.gnu.org/gitweb/?p=coreutils.git;a=commit;h=51ce0bf8
>>
>> Specifically this is now only outputting the first field,
>> rather than both fields like it should:
>>
>> printf '%s\n' a:1 b:2 | src/cut -s -d: -f1,2
> The problem was caused by `current_rp' which wasn't
> incremented as needed. See attachment for patch.
> My tests were succesfull, can you recheck?

Great tests now pass here.
I'll give it a thorough review tomorrow and apply.

thanks,
Pádraig.

Information forwarded to bug-coreutils <at> gnu.org:
bug#13127; Package coreutils. (Sun, 28 Apr 2013 01:52:02 GMT) Full text and rfc822 format available.

Message #34 received at 13127 <at> debbugs.gnu.org (full text, mbox):

From: Pádraig Brady <P <at> draigBrady.com>
To: xojoc <at> gmx.com
Cc: 13127 <at> debbugs.gnu.org
Subject: Re: bug#13127: [PATCH] cut: use only one data strucutre
Date: Sun, 28 Apr 2013 02:51:13 +0100

On 04/26/2013 05:11 PM, Pádraig Brady wrote:
> This separate patch to simplify the print_kth() function
> by removing the comparison from it, is simple and
> has a significant perf advantage. Tests pass so I'll apply.
> 
> I'll adjust the commit log to summarise the perf change,
> but I notice the change isn't as great as yours on my sandybridge i3 system.
> Benchmark results for both the rebased memory rework and
> the simple print_kth() optimization attached.

So looking in detail, this central print_kth function is of most importance to performance.
I thought that your simplification of it might allow it to be auto inlined.
but I confirmed that gcc 4.6.0 -O2 does not do this at present by doing:

  objdump -d src/cut.o | grep -q '<print_kth>:' && echo called || echo inlined

Marking it as inlined gives another gain as shown below.

Testing these combinations, we have:
orig = bit array implementation
split = ditto + simplified print_kth
split-inline = ditto + inlined print_kth
mem = no bit array
mem-split = ditto + simplified print_kth
mem-inline = ditto + inlined print_kth

$ yes abcdfeg | head -n1MB > big-file
$ for c in orig split split-inline mem mem-split mem-split-inline; do
    src/cut-$c 2>/dev/null
    echo -ne "\n== $c =="
    time src/cut-$c -b1,3 big-file > /dev/null
  done

== orig ==
real	0m0.084s
user	0m0.078s
sys	0m0.006s

== split ==
real	0m0.077s
user	0m0.070s
sys	0m0.006s

== split-inline ==
real	0m0.055s
user	0m0.049s
sys	0m0.006s

== mem ==
real	0m0.111s
user	0m0.108s
sys	0m0.002s

== mem-split ==
real	0m0.088s
user	0m0.081s
sys	0m0.007s

== mem-split-inline ==
real	0m0.070s
user	0m0.060s
sys	0m0.009s

So in summary, removing the bit array does slow things down,
but with the advantage of disassociating mem usage from range width.
I'll split the patch into two for the mem change and the cpu change,
and might follow up with a subsequent patch to reinstate the bit array
for the common case of small -[bcf] and no --output-delim.
That's a common trend in these mem adjustment patches.
I.E. Find a point to switch from the more CPU efficient method,
to one which is more memory efficient.

thanks,
Pádraig.

Information forwarded to bug-coreutils <at> gnu.org:
bug#13127; Package coreutils. (Sun, 28 Apr 2013 02:41:05 GMT) Full text and rfc822 format available.

Message #37 received at 13127 <at> debbugs.gnu.org (full text, mbox):

From: Pádraig Brady <P <at> draigBrady.com>
To: xojoc <at> gmx.com
Cc: 13127 <at> debbugs.gnu.org
Subject: Re: bug#13127: [PATCH] cut: use only one data strucutre
Date: Sun, 28 Apr 2013 03:40:09 +0100

[Message part 1 (text/plain, inline)]

On 04/26/2013 05:07 PM, Pádraig Brady wrote:
> I've rebased this to master and attached.
> The rebase wasn't trivial so I might have messed up.
> The cut.pl test is now failing on master. Could you have a look.
> Also could you add a test (or just a bit of shell) to demonstrate
> which options the memory is not allocated for example.
> Ideally some pathological option combo that no longer
> allocates huge amounts of RAM.

I refactored a little more (see next_item()).
Also split to two patches and added some benchmarks.

Will push the attached in a few hours.

thanks,
Pádraig.

[cut-mem.patch (text/x-patch, attachment)]

Information forwarded to bug-coreutils <at> gnu.org:
bug#13127; Package coreutils. (Sun, 28 Apr 2013 11:45:01 GMT) Full text and rfc822 format available.

Message #40 received at 13127 <at> debbugs.gnu.org (full text, mbox):

From: Cojocaru Alexandru <xojoc <at> gmx.com>
To: Pádraig Brady <P <at> draigBrady.com>
Cc: 13127 <at> debbugs.gnu.org
Subject: Re: bug#13127: [PATCH] cut: use only one data strucutre
Date: Sun, 28 Apr 2013 13:44:09 +0200

On Sun, 28 Apr 2013 02:51:13 +0100
Pádraig Brady <P <at> draigBrady.com> wrote:
> So looking in detail, this central print_kth function is of most importance to performance.
I made the same conclusion as yours, see:
	http://lists.gnu.org/archive/html/bug-coreutils/2012-12/msg00045.html

> I thought that your simplification of it might allow it to be auto inlined.
> but I confirmed that gcc 4.6.0 -O2 does not do this at present by doing:
> 
>   objdump -d src/cut.o | grep -q '<print_kth>:' && echo called || echo inlined
With gcc 4.8.0 -O2 both `print_kth' and `is_range_start' are inlined
even without the `inline' keyword:
    nm src/cut | grep print_kth
    nm src/cut | grep is_range_start
both the above comands give me no output.

> Marking it as inlined gives another gain as shown below.
> 
> Testing these combinations, we have:
> orig = bit array implementation
> split = ditto + simplified print_kth
> split-inline = ditto + inlined print_kth
> mem = no bit array
> mem-split = ditto + simplified print_kth
> mem-inline = ditto + inlined print_kth
> 
> $ yes abcdfeg | head -n1MB > big-file
> $ for c in orig split split-inline mem mem-split mem-split-inline; do
>     src/cut-$c 2>/dev/null
>     echo -ne "\n== $c =="
>     time src/cut-$c -b1,3 big-file > /dev/null
>   done
> 
> == orig ==
> real	0m0.084s
> user	0m0.078s
> sys	0m0.006s
> 
> == split ==
> real	0m0.077s
> user	0m0.070s
> sys	0m0.006s
> 
> == split-inline ==
> real	0m0.055s
> user	0m0.049s
> sys	0m0.006s
> 
> == mem ==
> real	0m0.111s
> user	0m0.108s
> sys	0m0.002s
> 
> == mem-split ==
> real	0m0.088s
> user	0m0.081s
> sys	0m0.007s
> 
> == mem-split-inline ==
> real	0m0.070s
> user	0m0.060s
> sys	0m0.009s
> 
> So in summary, removing the bit array does slow things down,
I think that the problem lies in `print_kth' again. I've wrongly put
an useless branch in it. See the attachment for a patch.
Another problem may be the merging and the call to `xrealloc' that
we do at the end of `set_fields'.

> but with the advantage of disassociating mem usage from range width.
> I'll split the patch into two for the mem change and the cpu change,
> and might follow up with a subsequent patch to reinstate the bit array
> for the common case of small -[bcf] and no --output-delim.
My primary goal was to simplify the code. Even if the attached patch
dosen't work, I think that detecting small -[bcf] ranges would just make
the code more cumbersome.

> That's a common trend in these mem adjustment patches.
> I.E. Find a point to switch from the more CPU efficient method,
> to one which is more memory efficient.
> 
> thanks,
> Pádraig.

Please could you re-run the benchmarks after applying the patch?
Could you also try with a bigger file (for example 100MB)? So as
to make the difference among the various patches more clear.
Unfortunately I'm under an emulator and the benchmarks aren't very
faithful.

Best regards,
Cojocaru Alexandru

Information forwarded to bug-coreutils <at> gnu.org:
bug#13127; Package coreutils. (Sun, 28 Apr 2013 14:05:02 GMT) Full text and rfc822 format available.

Message #43 received at 13127 <at> debbugs.gnu.org (full text, mbox):

From: Pádraig Brady <P <at> draigBrady.com>
To: Cojocaru Alexandru <xojoc <at> gmx.com>
Cc: 13127 <at> debbugs.gnu.org
Subject: Re: bug#13127: [PATCH] cut: use only one data strucutre
Date: Sun, 28 Apr 2013 15:04:31 +0100

On 04/28/2013 12:44 PM, Cojocaru Alexandru wrote:
> On Sun, 28 Apr 2013 02:51:13 +0100
> Pádraig Brady <P <at> draigBrady.com> wrote:
>> So looking in detail, this central print_kth function is of most importance to performance.
> I made the same conclusion as yours, see:
> 	http://lists.gnu.org/archive/html/bug-coreutils/2012-12/msg00045.html
> 
>> I thought that your simplification of it might allow it to be auto inlined.
>> but I confirmed that gcc 4.6.0 -O2 does not do this at present by doing:
>>
>>   objdump -d src/cut.o | grep -q '<print_kth>:' && echo called || echo inlined
> With gcc 4.8.0 -O2 both `print_kth' and `is_range_start' are inlined
> even without the `inline' keyword:
>     nm src/cut | grep print_kth
>     nm src/cut | grep is_range_start
> both the above comands give me no output.

Good gcc is getting better.
We'll leave the inline for now at least,
to aid non bleeding edge gcc.

>> Marking it as inlined gives another gain as shown below.
>>
>> Testing these combinations, we have:
>> orig = bit array implementation
>> split = ditto + simplified print_kth
>> split-inline = ditto + inlined print_kth
>> mem = no bit array
>> mem-split = ditto + simplified print_kth
>> mem-inline = ditto + inlined print_kth
>>
>> $ yes abcdfeg | head -n1MB > big-file
>> $ for c in orig split split-inline mem mem-split mem-split-inline; do
>>     src/cut-$c 2>/dev/null
>>     echo -ne "\n== $c =="
>>     time src/cut-$c -b1,3 big-file > /dev/null
>>   done
>>
>> == orig ==
>> real	0m0.084s
>> user	0m0.078s
>> sys	0m0.006s
>>
>> == split ==
>> real	0m0.077s
>> user	0m0.070s
>> sys	0m0.006s
>>
>> == split-inline ==
>> real	0m0.055s
>> user	0m0.049s
>> sys	0m0.006s
>>
>> == mem ==
>> real	0m0.111s
>> user	0m0.108s
>> sys	0m0.002s
>>
>> == mem-split ==
>> real	0m0.088s
>> user	0m0.081s
>> sys	0m0.007s
>>
>> == mem-split-inline ==
>> real	0m0.070s
>> user	0m0.060s
>> sys	0m0.009s
>>
>> So in summary, removing the bit array does slow things down,
> I think that the problem lies in `print_kth' again. I've wrongly put
> an useless branch in it. See the attachment for a patch.

Did you forget to attach?

> Another problem may be the merging and the call to `xrealloc' that
> we do at the end of `set_fields'.

That's only called at startup so I wouldn't worry too much.
What was your specific concern here?

>> but with the advantage of disassociating mem usage from range width.
>> I'll split the patch into two for the mem change and the cpu change,
>> and might follow up with a subsequent patch to reinstate the bit array
>> for the common case of small -[bcf] and no --output-delim.
> My primary goal was to simplify the code. Even if the attached patch
> dosen't work, I think that detecting small -[bcf] ranges would just make
> the code more cumbersome.

Yes it's a trade off. For often used tools such as coreutils though,
it's sometimes worth a little bit extra complexity for performance reasons.
Here we might be able to guide the compiler around the branches like:

print_kth()
{
  if likely(bitarray_used)
     ...
  else
     ...
}

Anyway I'll wait for your patch before carefully considering
to reinstate the bit array.

>> That's a common trend in these mem adjustment patches.
>> I.E. Find a point to switch from the more CPU efficient method,
>> to one which is more memory efficient.
>>
>> thanks,
>> Pádraig.
> 
> Please could you re-run the benchmarks after applying the patch?
> Could you also try with a bigger file (for example 100MB)? So as
> to make the difference among the various patches more clear.
> Unfortunately I'm under an emulator and the benchmarks aren't very
> faithful.

Sure. Eagerly waiting the patch :)

Pádraig.

Information forwarded to bug-coreutils <at> gnu.org:
bug#13127; Package coreutils. (Sun, 28 Apr 2013 17:22:01 GMT) Full text and rfc822 format available.

Message #46 received at submit <at> debbugs.gnu.org (full text, mbox):

From: Philipp Thomas <philipp <at> thogro.org>
To: bug-coreutils <at> gnu.org
Subject: Re: bug#13127: [PATCH] cut: use only one data strucutre
Date: Sun, 28 Apr 2013 09:24:47 +0200

Am 28.04.2013 04:40, schrieb Pádraig Brady:

>  2. When --output-delimiter is specified, it will allocate 31 buckets.
>     Even if a few ranges are specified.

Shouldn't this be "Even if only a few ranges are specified"?

Philipp

Information forwarded to bug-coreutils <at> gnu.org:
bug#13127; Package coreutils. (Sun, 28 Apr 2013 18:15:01 GMT) Full text and rfc822 format available.

Message #49 received at 13127 <at> debbugs.gnu.org (full text, mbox):

From: Cojocaru Alexandru <xojoc <at> gmx.com>
To: Pádraig Brady <P <at> draigBrady.com>
Cc: 13127 <at> debbugs.gnu.org
Subject: Re: bug#13127: [PATCH] cut: use only one data strucutre
Date: Sun, 28 Apr 2013 20:14:33 +0200

[Message part 1 (text/plain, inline)]

On Sun, 28 Apr 2013 15:04:31 +0100
Pádraig Brady <P <at> draigBrady.com> wrote:

> On 04/28/2013 12:44 PM, Cojocaru Alexandru wrote:
> > Another problem may be the merging and the call to `xrealloc' that
> > we do at the end of `set_fields'.
> 
> That's only called at startup so I wouldn't worry too much.
> What was your specific concern here?
The file you used with the benchmarks was quite small, so I was
worring that both the loop used for the merging and the call to
`xrealloc' was affecting too much the benchmarks.

> >> but with the advantage of disassociating mem usage from range width.
> >> I'll split the patch into two for the mem change and the cpu change,
> >> and might follow up with a subsequent patch to reinstate the bit array
> >> for the common case of small -[bcf] and no --output-delim.
> > My primary goal was to simplify the code. Even if the attached patch
> > dosen't work, I think that detecting small -[bcf] ranges would just make
> > the code more cumbersome.
> 
> Yes it's a trade off. For often used tools such as coreutils though,
> it's sometimes worth a little bit extra complexity for performance reasons.
> Here we might be able to guide the compiler around the branches like:
> 
> print_kth()
> {
>   if likely(bitarray_used)
>      ...
>   else
>      ...
> }
Ok.

> Anyway I'll wait for your patch before carefully considering
> to reinstate the bit array.
Please, give me some more time. I think that it would be possible to
use the sentinel to speed up things a bit.
[...]
> Sure. Eagerly waiting the patch :)
Attached.


Best regards,
Cojocaru Alexandru

[avoid-branch.patch (application/octet-stream, attachment)]

Information forwarded to bug-coreutils <at> gnu.org:
bug#13127; Package coreutils. (Sun, 28 Apr 2013 20:12:01 GMT) Full text and rfc822 format available.

Message #52 received at 13127 <at> debbugs.gnu.org (full text, mbox):

From: Pádraig Brady <P <at> draigBrady.com>
To: Cojocaru Alexandru <xojoc <at> gmx.com>
Cc: 13127 <at> debbugs.gnu.org
Subject: Re: bug#13127: [PATCH] cut: use only one data strucutre
Date: Sun, 28 Apr 2013 21:11:10 +0100

On 04/28/2013 07:14 PM, Cojocaru Alexandru wrote:
> On Sun, 28 Apr 2013 15:04:31 +0100
> Pádraig Brady <P <at> draigBrady.com> wrote:
> 
>> On 04/28/2013 12:44 PM, Cojocaru Alexandru wrote:
>>> Another problem may be the merging and the call to `xrealloc' that
>>> we do at the end of `set_fields'.
>>
>> That's only called at startup so I wouldn't worry too much.
>> What was your specific concern here?
> The file you used with the benchmarks was quite small, so I was
> worring that both the loop used for the merging and the call to
> `xrealloc' was affecting too much the benchmarks.

Ah right. I've enough to see relative differences quite easily,
but will increase further benchmark sizes if needed.

>>>> but with the advantage of disassociating mem usage from range width.
>>>> I'll split the patch into two for the mem change and the cpu change,
>>>> and might follow up with a subsequent patch to reinstate the bit array
>>>> for the common case of small -[bcf] and no --output-delim.
>>> My primary goal was to simplify the code. Even if the attached patch
>>> dosen't work, I think that detecting small -[bcf] ranges would just make
>>> the code more cumbersome.
>>
>> Yes it's a trade off. For often used tools such as coreutils though,
>> it's sometimes worth a little bit extra complexity for performance reasons.
>> Here we might be able to guide the compiler around the branches like:
>>
>> print_kth()
>> {
>>   if likely(bitarray_used)
>>      ...
>>   else
>>      ...
>> }
> Ok.
> 
>> Anyway I'll wait for your patch before carefully considering
>> to reinstate the bit array.
> Please, give me some more time. I think that it would be possible to
> use the sentinel to speed up things a bit.

Sure.

> [...]
>> Sure. Eagerly waiting the patch :)
> Attached.

That changes the else to an ||
I thought gcc would optimize that to the same code.
While the assembly generated is a little different,
the performance of both is essentially the same.

thanks,
Pádraig.

Information forwarded to bug-coreutils <at> gnu.org:
bug#13127; Package coreutils. (Mon, 29 Apr 2013 00:00:02 GMT) Full text and rfc822 format available.

Message #55 received at 13127 <at> debbugs.gnu.org (full text, mbox):

From: Pádraig Brady <P <at> draigBrady.com>
To: Cojocaru Alexandru <xojoc <at> gmx.com>
Cc: 13127 <at> debbugs.gnu.org
Subject: Re: bug#13127: [PATCH] cut: use only one data strucutre
Date: Mon, 29 Apr 2013 00:59:20 +0100

[Message part 1 (text/plain, inline)]

So I reinstated the bit vector which was a little tricky
to do while maintaining performance, but it works very well.
So in summary with the attached 3 patch series, the CPU
usage of the common cut path is nearly halved, while the
max memory that will be allocated for the bit vector is 64KiB.

I'll apply this series in the morning.

thanks,
Pádraig.

p.s. I doubt adding a sentinel to the range pair structure
would out performance the bit vector approach, given the
significant benefit shown in the benchmark in the commit message.

[cut-mem.patch (text/x-patch, attachment)]

Information forwarded to bug-coreutils <at> gnu.org:
bug#13127; Package coreutils. (Mon, 06 May 2013 18:56:02 GMT) Full text and rfc822 format available.

Message #58 received at 13127 <at> debbugs.gnu.org (full text, mbox):

From: Cojocaru Alexandru <xojoc <at> gmx.com>
To: Pádraig Brady <P <at> draigBrady.com>
Cc: 13127 <at> debbugs.gnu.org
Subject: Re: bug#13127: [PATCH] cut: use only one data strucutre
Date: Mon, 6 May 2013 20:54:01 +0200

[Message part 1 (text/plain, inline)]

On Mon, 29 Apr 2013 00:59:20 +0100
Pádraig Brady <P <at> draigBrady.com> wrote:

> So I reinstated the bit vector which was a little tricky
> to do while maintaining performance, but it works very well.
I think it works because we are avoiding a memory access
inside `next_item' this way.

With this patch I try to keep the CPU benefits for `--output-d'
and when large ranges are specified, even without the bitarray.

Because of the sentinel now the max line len supported will be
`(size_t)-1 - 1' and no more `(size_t)-1'. Is this an issue?

PS: This patch also fix a little bug inside `set_fields'.

Best regards,
Cojocaru Alexandru

[cut.patch (application/octet-stream, attachment)]

Information forwarded to bug-coreutils <at> gnu.org:
bug#13127; Package coreutils. (Tue, 07 May 2013 12:12:02 GMT) Full text and rfc822 format available.

Message #61 received at 13127 <at> debbugs.gnu.org (full text, mbox):

From: Pádraig Brady <P <at> draigBrady.com>
To: Cojocaru Alexandru <xojoc <at> gmx.com>
Cc: 13127 <at> debbugs.gnu.org
Subject: Re: bug#13127: [PATCH] cut: use only one data strucutre
Date: Tue, 07 May 2013 13:10:08 +0100

[Message part 1 (text/plain, inline)]

On 05/06/2013 07:54 PM, Cojocaru Alexandru wrote:
> On Mon, 29 Apr 2013 00:59:20 +0100
> Pádraig Brady <P <at> draigBrady.com> wrote:
> 
>> So I reinstated the bit vector which was a little tricky
>> to do while maintaining performance, but it works very well.
> I think it works because we are avoiding a memory access
> inside `next_item' this way.
> 
> With this patch I try to keep the CPU benefits for `--output-d'
> and when large ranges are specified, even without the bitarray.
> 
> Because of the sentinel now the max line len supported will be
> `(size_t)-1 - 1' and no more `(size_t)-1'. Is this an issue?
> 
> PS: This patch also fix a little bug inside `set_fields'.

It's always best to have separate changes.
I've split the fix out (attached) with an associated test.

thanks,
Pádraig.

[cut-merge-fix.patch (text/x-patch, attachment)]

Information forwarded to bug-coreutils <at> gnu.org:
bug#13127; Package coreutils. (Tue, 07 May 2013 13:54:01 GMT) Full text and rfc822 format available.

Message #64 received at 13127 <at> debbugs.gnu.org (full text, mbox):

From: Pádraig Brady <P <at> draigBrady.com>
To: Cojocaru Alexandru <xojoc <at> gmx.com>
Cc: 13127 <at> debbugs.gnu.org
Subject: Re: bug#13127: [PATCH] cut: use only one data strucutre
Date: Tue, 07 May 2013 14:51:52 +0100

[Message part 1 (text/plain, inline)]

On 05/07/2013 01:10 PM, Pádraig Brady wrote:
> On 05/06/2013 07:54 PM, Cojocaru Alexandru wrote:
>> On Mon, 29 Apr 2013 00:59:20 +0100
>> Pádraig Brady <P <at> draigBrady.com> wrote:
>>
>>> So I reinstated the bit vector which was a little tricky
>>> to do while maintaining performance, but it works very well.
>> I think it works because we are avoiding a memory access
>> inside `next_item' this way.
>>
>> With this patch I try to keep the CPU benefits for `--output-d'
>> and when large ranges are specified, even without the bitarray.
>>
>> Because of the sentinel now the max line len supported will be
>> `(size_t)-1 - 1' and no more `(size_t)-1'. Is this an issue?

Not a practical one.
We could bump the types/limits in the range pairs
up to uintmax_t since we're now not allocating
lot of corresponding memory.
Note I added a specific check to make it
explicit that -b$SIZE_MAX is not supported if specified.

I'll do that in a subsequent patch, but it's
not a practical issue for now, as we still allocate mem
for the whole line.

The new patch performs well!

I'll apply the attached in a little while.

thanks!
Pádraig.

[cut-sentinel.patch (text/x-patch, attachment)]

Information forwarded to bug-coreutils <at> gnu.org:
bug#13127; Package coreutils. (Wed, 08 May 2013 06:56:02 GMT) Full text and rfc822 format available.

Message #67 received at 13127 <at> debbugs.gnu.org (full text, mbox):

From: Bernhard Voelker <mail <at> bernhard-voelker.de>
To: "Pádraig Brady" <P <at> draigBrady.com>, 
	Cojocaru Alexandru <xojoc <at> gmx.com>
Cc: 13127 <at> debbugs.gnu.org
Subject: Re: bug#13127: [PATCH] cut: use only one data strucutre
Date: Wed, 8 May 2013 08:54:26 +0200 (CEST)

> On May 7, 2013 at 2:10 PM Pádraig Brady <P <at> draigBrady.com> wrote:
> It's always best to have separate changes.
> I've split the fix out (attached) with an associated test.

The patch looks fine, thanks.

Have a nice day,
Berny

Information forwarded to bug-coreutils <at> gnu.org:
bug#13127; Package coreutils. (Wed, 08 May 2013 06:57:02 GMT) Full text and rfc822 format available.

Message #70 received at 13127 <at> debbugs.gnu.org (full text, mbox):

From: Bernhard Voelker <mail <at> bernhard-voelker.de>
To: "Pádraig Brady" <P <at> draigBrady.com>, 
	Cojocaru Alexandru <xojoc <at> gmx.com>
Cc: 13127 <at> debbugs.gnu.org
Subject: Re: bug#13127: [PATCH] cut: use only one data strucutre
Date: Wed, 8 May 2013 08:54:53 +0200 (CEST)

> On May 7, 2013 at 3:51 PM Pádraig Brady <P <at> draigBrady.com> wrote:
> I'll apply the attached in a little while.

+1

Have a nice day,
Berny

bug archived. Request was from Debbugs Internal Request <help-debbugs <at> gnu.org> to internal_control <at> debbugs.gnu.org. (Wed, 05 Jun 2013 11:24:04 GMT) Full text and rfc822 format available.

This bug report was last modified 10 years and 326 days ago.

Previous Next

GNU bug tracking system
Copyright (C) 1999 Darren O. Benham, 1997,2003 nCipher Corporation Ltd, 1994-97 Ian Jackson.

GNU bug report logs - #13127 [PATCH] cut: use only one data strucutre

GNU bug report logs - #13127
[PATCH] cut: use only one data strucutre