GNU bug report logs - #24260
[PATCH 1/6] dfa: thread-safety: remove 'dfa' global in dfa.c

Package: grep;

Reported by: Zev Weiss <zev <at> bewilderbeest.net>

Date: Thu, 18 Aug 2016 10:51:02 UTC

Severity: normal

Tags: patch

Done: Jim Meyering <jim <at> meyering.net>

Bug is archived. No further changes may be made.

To add a comment to this bug, you must first unarchive it, by sending
a message to control AT debbugs.gnu.org, with unarchive 24260 in the body.
You can then email your comments to 24260 AT debbugs.gnu.org in the normal way.

Toggle the display of automated, internal messages from the tracker.

View this report as an mbox folder, status mbox, maintainer mbox

Report forwarded to bug-grep <at> gnu.org:
bug#24260; Package grep. (Thu, 18 Aug 2016 10:51:02 GMT) Full text and rfc822 format available.

Acknowledgement sent to Zev Weiss <zev <at> bewilderbeest.net>:
New bug report received and forwarded. Copy sent to bug-grep <at> gnu.org. (Thu, 18 Aug 2016 10:51:02 GMT) Full text and rfc822 format available.

Message #5 received at submit <at> debbugs.gnu.org (full text, mbox):

From: Zev Weiss <zev <at> bewilderbeest.net>
To: bug-grep <at> gnu.org
Cc: Zev Weiss <zev <at> bewilderbeest.net>
Subject: [PATCH 1/6] dfa: thread-safety: remove 'dfa' global in dfa.c
Date: Thu, 18 Aug 2016 05:50:14 -0500

* src/dfa.c: remove global dfa struct.  A pointer to a struct dfa is
instead added as a parameter to the functions that had been using the
global.
---
 src/dfa.c | 207 +++++++++++++++++++++++++++++---------------------------------
 1 file changed, 98 insertions(+), 109 deletions(-)

diff --git a/src/dfa.c b/src/dfa.c
index d337bb6..5bd2a92 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -451,7 +451,7 @@ struct dfa
 #define ACCEPTS_IN_CONTEXT(prev, curr, state, dfa) \
   SUCCEEDS_IN_CONTEXT ((dfa).states[state].constraint, prev, curr)
 
-static void regexp (void);
+static void regexp (struct dfa *dfa);
 
 /* A table indexed by byte values that contains the corresponding wide
    character (if any) for that byte.  WEOF means the byte is not a
@@ -670,16 +670,6 @@ dfa_charclass_index (struct dfa *d, charclass const s)
   return i;
 }
 
-/* A pointer to the current dfa is kept here during parsing.  */
-static struct dfa *dfa;
-
-/* Find the index of charclass S in the current DFA, or allocate a new one.  */
-static size_t
-charclass_index (charclass const s)
-{
-  return dfa_charclass_index (dfa, s);
-}
-
 /* Syntax bits controlling the behavior of the lexical analyzer.  */
 static reg_syntax_t syntax_bits;
 static bool syntax_bits_set;
@@ -807,7 +797,7 @@ using_utf8 (void)
    processed more efficiently.  */
 
 static bool
-using_simple_locale (void)
+using_simple_locale (struct dfa *dfa)
 {
   /* The native character set is known to be compatible with
      the C locale.  The following test isn't perfect, but it's good
@@ -870,7 +860,7 @@ static wint_t wctok;		/* Wide character representation of the current
    of length 1); otherwise set WC to WEOF.  If there is no more input,
    report EOFERR if EOFERR is not null, and return lasttok = END
    otherwise.  */
-# define FETCH_WC(c, wc, eoferr)		\
+# define FETCH_WC(dfa, c, wc, eoferr)		\
   do {						\
     if (! lexleft)				\
       {						\
@@ -984,7 +974,7 @@ find_pred (const char *str)
 /* Multibyte character handling sub-routine for lex.
    Parse a bracket expression and build a struct mb_char_classes.  */
 static token
-parse_bracket_exp (void)
+parse_bracket_exp (struct dfa *dfa)
 {
   bool invert;
   int c, c1, c2;
@@ -1028,12 +1018,12 @@ parse_bracket_exp (void)
     work_mbc = NULL;
 
   memset (ccl, 0, sizeof ccl);
-  FETCH_WC (c, wc, _("unbalanced ["));
+  FETCH_WC (dfa, c, wc, _("unbalanced ["));
   if (c == '^')
     {
-      FETCH_WC (c, wc, _("unbalanced ["));
+      FETCH_WC (dfa, c, wc, _("unbalanced ["));
       invert = true;
-      known_bracket_exp = using_simple_locale ();
+      known_bracket_exp = using_simple_locale (dfa);
     }
   else
     invert = false;
@@ -1050,7 +1040,7 @@ parse_bracket_exp (void)
          dfa is ever called.  */
       if (c == '[')
         {
-          FETCH_WC (c1, wc1, _("unbalanced ["));
+          FETCH_WC (dfa, c1, wc1, _("unbalanced ["));
 
           if ((c1 == ':' && (syntax_bits & RE_CHAR_CLASSES))
               || c1 == '.' || c1 == '=')
@@ -1060,7 +1050,7 @@ parse_bracket_exp (void)
               size_t len = 0;
               for (;;)
                 {
-                  FETCH_WC (c, wc, _("unbalanced ["));
+                  FETCH_WC (dfa, c, wc, _("unbalanced ["));
                   if ((c == c1 && *lexptr == ']') || lexleft == 0)
                     break;
                   if (len < MAX_BRACKET_STRING_LEN)
@@ -1072,7 +1062,7 @@ parse_bracket_exp (void)
               str[len] = '\0';
 
               /* Fetch bracket.  */
-              FETCH_WC (c, wc, _("unbalanced ["));
+              FETCH_WC (dfa, c, wc, _("unbalanced ["));
               if (c1 == ':')
                 /* Build character class.  POSIX allows character
                    classes to match multicharacter collating elements,
@@ -1099,7 +1089,7 @@ parse_bracket_exp (void)
               colon_warning_state |= 8;
 
               /* Fetch new lookahead character.  */
-              FETCH_WC (c1, wc1, _("unbalanced ["));
+              FETCH_WC (dfa, c1, wc1, _("unbalanced ["));
               continue;
             }
 
@@ -1108,15 +1098,15 @@ parse_bracket_exp (void)
         }
 
       if (c == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
-        FETCH_WC (c, wc, _("unbalanced ["));
+        FETCH_WC (dfa, c, wc, _("unbalanced ["));
 
       if (c1 == NOTCHAR)
-        FETCH_WC (c1, wc1, _("unbalanced ["));
+        FETCH_WC (dfa, c1, wc1, _("unbalanced ["));
 
       if (c1 == '-')
         /* build range characters.  */
         {
-          FETCH_WC (c2, wc2, _("unbalanced ["));
+          FETCH_WC (dfa, c2, wc2, _("unbalanced ["));
 
           /* A bracket expression like [a-[.aa.]] matches an unknown set.
              Treat it like [-a[.aa.]] while parsing it, and
@@ -1137,17 +1127,17 @@ parse_bracket_exp (void)
           else
             {
               if (c2 == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
-                FETCH_WC (c2, wc2, _("unbalanced ["));
+                FETCH_WC (dfa, c2, wc2, _("unbalanced ["));
 
               colon_warning_state |= 8;
-              FETCH_WC (c1, wc1, _("unbalanced ["));
+              FETCH_WC (dfa, c1, wc1, _("unbalanced ["));
 
               /* Treat [x-y] as a range if x != y.  */
               if (wc != wc2 || wc == WEOF)
                 {
                   if (dfa->multibyte)
                     known_bracket_exp = false;
-                  else if (using_simple_locale ())
+                  else if (using_simple_locale (dfa))
                     {
                       int ci;
                       for (ci = c; ci <= c2; ci++)
@@ -1214,7 +1204,7 @@ parse_bracket_exp (void)
   if (dfa->multibyte)
     {
       work_mbc->invert = invert;
-      work_mbc->cset = emptyset (ccl) ? -1 : charclass_index (ccl);
+      work_mbc->cset = emptyset (ccl) ? -1 : dfa_charclass_index (dfa, ccl);
       return MBCSET;
     }
 
@@ -1226,7 +1216,7 @@ parse_bracket_exp (void)
         clrbit ('\n', ccl);
     }
 
-  return CSET + charclass_index (ccl);
+  return CSET + dfa_charclass_index (dfa, ccl);
 }
 
 #define PUSH_LEX_STATE(s)			\
@@ -1244,7 +1234,7 @@ parse_bracket_exp (void)
   while (false)
 
 static token
-lex (void)
+lex (struct dfa *dfa)
 {
   int c, c2;
   bool backslash = false;
@@ -1259,7 +1249,7 @@ lex (void)
      "if (backslash) ...".  */
   for (i = 0; i < 2; ++i)
     {
-      FETCH_WC (c, wctok, NULL);
+      FETCH_WC (dfa, c, wctok, NULL);
 
       switch (c)
         {
@@ -1472,7 +1462,7 @@ lex (void)
           if (syntax_bits & RE_DOT_NOT_NULL)
             clrbit ('\0', ccl);
           laststart = false;
-          return lasttok = CSET + charclass_index (ccl);
+          return lasttok = CSET + dfa_charclass_index (dfa, ccl);
 
         case 's':
         case 'S':
@@ -1487,7 +1477,7 @@ lex (void)
               if (c == 'S')
                 notset (ccl);
               laststart = false;
-              return lasttok = CSET + charclass_index (ccl);
+              return lasttok = CSET + dfa_charclass_index (dfa, ccl);
             }
 
           /* FIXME: see if optimizing this, as is done with ANYCHAR and
@@ -1498,7 +1488,7 @@ lex (void)
              strings, each minus its "already processed" '['.  */
           PUSH_LEX_STATE (c == 's' ? "[:space:]]" : "^[:space:]]");
 
-          lasttok = parse_bracket_exp ();
+          lasttok = parse_bracket_exp (dfa);
 
           POP_LEX_STATE ();
 
@@ -1519,7 +1509,7 @@ lex (void)
               if (c == 'W')
                 notset (ccl);
               laststart = false;
-              return lasttok = CSET + charclass_index (ccl);
+              return lasttok = CSET + dfa_charclass_index (dfa, ccl);
             }
 
           /* FIXME: see if optimizing this, as is done with ANYCHAR and
@@ -1530,7 +1520,7 @@ lex (void)
              strings, each minus its "already processed" '['.  */
           PUSH_LEX_STATE (c == 'w' ? "_[:alnum:]]" : "^_[:alnum:]]");
 
-          lasttok = parse_bracket_exp ();
+          lasttok = parse_bracket_exp (dfa);
 
           POP_LEX_STATE ();
 
@@ -1541,7 +1531,7 @@ lex (void)
           if (backslash)
             goto normal_char;
           laststart = false;
-          return lasttok = parse_bracket_exp ();
+          return lasttok = parse_bracket_exp (dfa);
 
         default:
         normal_char:
@@ -1555,7 +1545,7 @@ lex (void)
             {
               zeroset (ccl);
               setbit_case_fold_c (c, ccl);
-              return lasttok = CSET + charclass_index (ccl);
+              return lasttok = CSET + dfa_charclass_index (dfa, ccl);
             }
 
           return lasttok = c;
@@ -1578,7 +1568,7 @@ static size_t depth;            /* Current depth of a hypothetical stack
                                    dfaanalyze.  */
 
 static void
-addtok_mb (token t, int mbprop)
+addtok_mb (struct dfa *dfa, token t, int mbprop)
 {
   if (dfa->talloc == dfa->tindex)
     {
@@ -1618,12 +1608,12 @@ addtok_mb (token t, int mbprop)
     dfa->depth = depth;
 }
 
-static void addtok_wc (wint_t wc);
+static void addtok_wc (struct dfa *dfa, wint_t wc);
 
 /* Add the given token to the parse tree, maintaining the depth count and
    updating the maximum depth if necessary.  */
 static void
-addtok (token t)
+addtok (struct dfa *dfa, token t)
 {
   if (dfa->multibyte && t == MBCSET)
     {
@@ -1635,9 +1625,9 @@ addtok (token t)
          This does not require UTF-8.  */
       for (i = 0; i < work_mbc->nchars; i++)
         {
-          addtok_wc (work_mbc->chars[i]);
+          addtok_wc (dfa, work_mbc->chars[i]);
           if (need_or)
-            addtok (OR);
+            addtok (dfa, OR);
           need_or = true;
         }
       work_mbc->nchars = 0;
@@ -1646,14 +1636,14 @@ addtok (token t)
          that the mbcset is empty now.  Do nothing in that case.  */
       if (work_mbc->cset != -1)
         {
-          addtok (CSET + work_mbc->cset);
+          addtok (dfa, CSET + work_mbc->cset);
           if (need_or)
-            addtok (OR);
+            addtok (dfa, OR);
         }
     }
   else
     {
-      addtok_mb (t, 3);
+      addtok_mb (dfa, t, 3);
     }
 }
 
@@ -1664,7 +1654,7 @@ addtok (token t)
    <mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT>
    <mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT> */
 static void
-addtok_wc (wint_t wc)
+addtok_wc (struct dfa *dfa, wint_t wc)
 {
   unsigned char buf[MB_LEN_MAX];
   mbstate_t s = { 0 };
@@ -1681,16 +1671,16 @@ addtok_wc (wint_t wc)
       buf[0] = 0;
     }
 
-  addtok_mb (buf[0], cur_mb_len == 1 ? 3 : 1);
+  addtok_mb (dfa, buf[0], cur_mb_len == 1 ? 3 : 1);
   for (i = 1; i < cur_mb_len; i++)
     {
-      addtok_mb (buf[i], i == cur_mb_len - 1 ? 2 : 0);
-      addtok (CAT);
+      addtok_mb (dfa, buf[i], i == cur_mb_len - 1 ? 2 : 0);
+      addtok (dfa, CAT);
     }
 }
 
 static void
-add_utf8_anychar (void)
+add_utf8_anychar (struct dfa *dfa)
 {
   static charclass const utf8_classes[5] = {
     /* 80-bf: non-leading bytes.  */
@@ -1724,7 +1714,7 @@ add_utf8_anychar (void)
             if (syntax_bits & RE_DOT_NOT_NULL)
               clrbit ('\0', c);
           }
-        dfa->utf8_anychar_classes[i] = CSET + charclass_index (c);
+        dfa->utf8_anychar_classes[i] = CSET + dfa_charclass_index (dfa, c);
       }
 
   /* A valid UTF-8 character is
@@ -1738,12 +1728,12 @@ add_utf8_anychar (void)
      and you get "B|(C|(D|EA)A)A".  And since the token buffer is in reverse
      Polish notation, you get "B C D E A CAT OR A CAT OR A CAT OR".  */
   for (i = 1; i < n; i++)
-    addtok (dfa->utf8_anychar_classes[i]);
+    addtok (dfa, dfa->utf8_anychar_classes[i]);
   while (--i > 1)
     {
-      addtok (dfa->utf8_anychar_classes[0]);
-      addtok (CAT);
-      addtok (OR);
+      addtok (dfa, dfa->utf8_anychar_classes[0]);
+      addtok (dfa, CAT);
+      addtok (dfa, OR);
     }
 }
 
@@ -1783,15 +1773,15 @@ add_utf8_anychar (void)
    The parser builds a parse tree in postfix form in an array of tokens.  */
 
 static void
-atom (void)
+atom (struct dfa *dfa)
 {
   if (tok == WCHAR)
     {
       if (wctok == WEOF)
-        addtok (BACKREF);
+        addtok (dfa, BACKREF);
       else
         {
-          addtok_wc (wctok);
+          addtok_wc (dfa, wctok);
 
           if (case_fold)
             {
@@ -1799,13 +1789,13 @@ atom (void)
               unsigned int i, n = case_folded_counterparts (wctok, folded);
               for (i = 0; i < n; i++)
                 {
-                  addtok_wc (folded[i]);
-                  addtok (OR);
+                  addtok_wc (dfa, folded[i]);
+                  addtok (dfa, OR);
                 }
             }
         }
 
-      tok = lex ();
+      tok = lex (dfa);
     }
   else if (tok == ANYCHAR && using_utf8 ())
     {
@@ -1816,32 +1806,32 @@ atom (void)
          it is done above in add_utf8_anychar.  So, let's start with
          UTF-8: it is the most used, and the structure of the encoding
          makes the correctness more obvious.  */
-      add_utf8_anychar ();
-      tok = lex ();
+      add_utf8_anychar (dfa);
+      tok = lex (dfa);
     }
   else if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF
            || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD
            || tok == ANYCHAR || tok == MBCSET
            || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD)
     {
-      addtok (tok);
-      tok = lex ();
+      addtok (dfa, tok);
+      tok = lex (dfa);
     }
   else if (tok == LPAREN)
     {
-      tok = lex ();
-      regexp ();
+      tok = lex (dfa);
+      regexp (dfa);
       if (tok != RPAREN)
         dfaerror (_("unbalanced ("));
-      tok = lex ();
+      tok = lex (dfa);
     }
   else
-    addtok (EMPTY);
+    addtok (dfa, EMPTY);
 }
 
 /* Return the number of tokens in the given subexpression.  */
 static size_t _GL_ATTRIBUTE_PURE
-nsubtoks (size_t tindex)
+nsubtoks (struct dfa *dfa, size_t tindex)
 {
   size_t ntoks1;
 
@@ -1852,90 +1842,90 @@ nsubtoks (size_t tindex)
     case QMARK:
     case STAR:
     case PLUS:
-      return 1 + nsubtoks (tindex - 1);
+      return 1 + nsubtoks (dfa, tindex - 1);
     case CAT:
     case OR:
-      ntoks1 = nsubtoks (tindex - 1);
-      return 1 + ntoks1 + nsubtoks (tindex - 1 - ntoks1);
+      ntoks1 = nsubtoks (dfa, tindex - 1);
+      return 1 + ntoks1 + nsubtoks (dfa, tindex - 1 - ntoks1);
     }
 }
 
 /* Copy the given subexpression to the top of the tree.  */
 static void
-copytoks (size_t tindex, size_t ntokens)
+copytoks (struct dfa *dfa, size_t tindex, size_t ntokens)
 {
   size_t i;
 
   if (dfa->multibyte)
     for (i = 0; i < ntokens; ++i)
-      addtok_mb (dfa->tokens[tindex + i], dfa->multibyte_prop[tindex + i]);
+      addtok_mb (dfa, dfa->tokens[tindex + i], dfa->multibyte_prop[tindex + i]);
   else
     for (i = 0; i < ntokens; ++i)
-      addtok_mb (dfa->tokens[tindex + i], 3);
+      addtok_mb (dfa, dfa->tokens[tindex + i], 3);
 }
 
 static void
-closure (void)
+closure (struct dfa *dfa)
 {
   int i;
   size_t tindex, ntokens;
 
-  atom ();
+  atom (dfa);
   while (tok == QMARK || tok == STAR || tok == PLUS || tok == REPMN)
     if (tok == REPMN && (minrep || maxrep))
       {
-        ntokens = nsubtoks (dfa->tindex);
+        ntokens = nsubtoks (dfa, dfa->tindex);
         tindex = dfa->tindex - ntokens;
         if (maxrep < 0)
-          addtok (PLUS);
+          addtok (dfa, PLUS);
         if (minrep == 0)
-          addtok (QMARK);
+          addtok (dfa, QMARK);
         for (i = 1; i < minrep; ++i)
           {
-            copytoks (tindex, ntokens);
-            addtok (CAT);
+            copytoks (dfa, tindex, ntokens);
+            addtok (dfa, CAT);
           }
         for (; i < maxrep; ++i)
           {
-            copytoks (tindex, ntokens);
-            addtok (QMARK);
-            addtok (CAT);
+            copytoks (dfa, tindex, ntokens);
+            addtok (dfa, QMARK);
+            addtok (dfa, CAT);
           }
-        tok = lex ();
+        tok = lex (dfa);
       }
     else if (tok == REPMN)
       {
-        dfa->tindex -= nsubtoks (dfa->tindex);
-        tok = lex ();
-        closure ();
+        dfa->tindex -= nsubtoks (dfa, dfa->tindex);
+        tok = lex (dfa);
+        closure (dfa);
       }
     else
       {
-        addtok (tok);
-        tok = lex ();
+        addtok (dfa, tok);
+        tok = lex (dfa);
       }
 }
 
 static void
-branch (void)
+branch (struct dfa* dfa)
 {
-  closure ();
+  closure (dfa);
   while (tok != RPAREN && tok != OR && tok >= 0)
     {
-      closure ();
-      addtok (CAT);
+      closure (dfa);
+      addtok (dfa, CAT);
     }
 }
 
 static void
-regexp (void)
+regexp (struct dfa *dfa)
 {
-  branch ();
+  branch (dfa);
   while (tok == OR)
     {
-      tok = lex ();
-      branch ();
-      addtok (OR);
+      tok = lex (dfa);
+      branch (dfa);
+      addtok (dfa, OR);
     }
 }
 
@@ -1945,13 +1935,12 @@ regexp (void)
 static void
 dfaparse (char const *s, size_t len, struct dfa *d)
 {
-  dfa = d;
   lexptr = s;
   lexleft = len;
   lasttok = END;
   laststart = true;
   parens = 0;
-  if (dfa->multibyte)
+  if (d->multibyte)
     {
       cur_mb_len = 0;
       memset (&d->mbs, 0, sizeof d->mbs);
@@ -1960,19 +1949,19 @@ dfaparse (char const *s, size_t len, struct dfa *d)
   if (!syntax_bits_set)
     dfaerror (_("no syntax specified"));
 
-  tok = lex ();
+  tok = lex (d);
   depth = d->depth;
 
-  regexp ();
+  regexp (d);
 
   if (tok != END)
     dfaerror (_("unbalanced )"));
 
-  addtok (END - d->nregexps);
-  addtok (CAT);
+  addtok (d, END - d->nregexps);
+  addtok (d, CAT);
 
   if (d->nregexps)
-    addtok (OR);
+    addtok (d, OR);
 
   ++d->nregexps;
 }
-- 
2.8.1

Information forwarded to bug-grep <at> gnu.org:
bug#24260; Package grep. (Thu, 18 Aug 2016 14:48:01 GMT) Full text and rfc822 format available.

Message #8 received at 24260 <at> debbugs.gnu.org (full text, mbox):

From: Jim Meyering <jim <at> meyering.net>
To: Zev Weiss <zev <at> bewilderbeest.net>
Cc: 24260 <at> debbugs.gnu.org
Subject: Re: bug#24260: [PATCH 1/6] dfa: thread-safety: remove 'dfa' global in
 dfa.c
Date: Thu, 18 Aug 2016 07:46:53 -0700

On Thu, Aug 18, 2016 at 3:50 AM, Zev Weiss <zev <at> bewilderbeest.net> wrote:
> * src/dfa.c: remove global dfa struct.  A pointer to a struct dfa is
> instead added as a parameter to the functions that had been using the
> global.

Thank you.
At first glance, I like it, but saw no "const" attribute on any added
parameter declaration.
Surely some of those can be "const".  Would you make all new dfa
parameters const that can be?

Information forwarded to bug-grep <at> gnu.org:
bug#24260; Package grep. (Fri, 19 Aug 2016 16:10:01 GMT) Full text and rfc822 format available.

Message #11 received at 24260 <at> debbugs.gnu.org (full text, mbox):

From: Norihiro Tanaka <noritnk <at> kcn.ne.jp>
To: Zev Weiss <zev <at> bewilderbeest.net>
Cc: 24260 <at> debbugs.gnu.org
Subject: Re: bug#24260: [PATCH 1/6] dfa: thread-safety: remove 'dfa' global in
 dfa.c
Date: Sat, 20 Aug 2016 01:09:20 +0900

On Thu, 18 Aug 2016 05:50:14 -0500
Zev Weiss <zev <at> bewilderbeest.net> wrote:

> * src/dfa.c: remove global dfa struct.  A pointer to a struct dfa is
> instead added as a parameter to the functions that had been using the
> global.

Hi,

Why we move global variable DFA into struct dfa, Although only used in
dfacomp() which I seem that does not support multithread even after
applying your patches?  Variables for syntax (lexptr etc.), like.

Thanks,
Norihiro

Information forwarded to bug-grep <at> gnu.org:
bug#24260; Package grep. (Fri, 19 Aug 2016 16:39:01 GMT) Full text and rfc822 format available.

Message #14 received at 24260 <at> debbugs.gnu.org (full text, mbox):

From: Jim Meyering <jim <at> meyering.net>
To: Zev Weiss <zev <at> bewilderbeest.net>
Cc: 24260 <at> debbugs.gnu.org
Subject: Re: bug#24260: [PATCH 1/6] dfa: thread-safety: remove 'dfa' global in
 dfa.c
Date: Fri, 19 Aug 2016 09:37:54 -0700

[Message part 1 (text/plain, inline)]

On Thu, Aug 18, 2016 at 7:46 AM, Jim Meyering <jim <at> meyering.net> wrote:
> On Thu, Aug 18, 2016 at 3:50 AM, Zev Weiss <zev <at> bewilderbeest.net> wrote:
>> * src/dfa.c: remove global dfa struct.  A pointer to a struct dfa is
>> instead added as a parameter to the functions that had been using the
>> global.
>
> Thank you.
> At first glance, I like it, but saw no "const" attribute on any added
> parameter declaration.
> Surely some of those can be "const".  Would you make all new dfa
> parameters const that can be?

Hi Zev,

I went ahead and did that for you.
There were only two: using_simple_locale and nsubtoks.
I amended your patch to make those "dfa" parameters "const"
and adjusted the commit message to conform to our standards:

  - use active voice, not passive
  - enumerate each affected variable/function

Yes, it is tedious, but for a project as mature as grep, it is
essential. You would not believe the number of times that having the
ability to cross-check the prose of a ChangeLog entry with the actual
change has led to pre-commit improvements.

Now, since I've changed a commit in your name that I expect to push,
it is your job to proof-read it to make sure you're still happy to
have your name on this modified commit. We treat our git repository as
write-once. No non-fast-forward pushes on master, so once pushed, it's
there forever. Let me know.

[dfa-remove-file-scoped-global.diff (text/plain, attachment)]

Information forwarded to bug-grep <at> gnu.org:
bug#24260; Package grep. (Fri, 19 Aug 2016 21:47:02 GMT) Full text and rfc822 format available.

Message #17 received at 24260 <at> debbugs.gnu.org (full text, mbox):

From: Zev Weiss <zev <at> bewilderbeest.net>
To: Norihiro Tanaka <noritnk <at> kcn.ne.jp>
Cc: 24260 <at> debbugs.gnu.org
Subject: Re: bug#24260: [PATCH 1/6] dfa: thread-safety: remove 'dfa' global
 in dfa.c
Date: Fri, 19 Aug 2016 16:46:16 -0500

On Sat, Aug 20, 2016 at 01:09:20AM +0900, Norihiro Tanaka wrote:
>
>On Thu, 18 Aug 2016 05:50:14 -0500
>Zev Weiss <zev <at> bewilderbeest.net> wrote:
>
>> * src/dfa.c: remove global dfa struct.  A pointer to a struct dfa is
>> instead added as a parameter to the functions that had been using the
>> global.
>
>Hi,
>
>Why we move global variable DFA into struct dfa, Although only used in
>dfacomp() which I seem that does not support multithread even after
>applying your patches?  Variables for syntax (lexptr etc.), like.
>
>Thanks,
>Norihiro
>

I'm not sure I understand -- the first patch in my series just removes 
the global dfa variable and instead passes it as a parameter.  This 
alone doesn't make the whole thing thread-safe, it's just a first step 
along the way.  With the whole series applied though I believe 
everything should be thread-safe (for example, lexptr is moved into a 
sub-struct of struct dfa in the second patch in the series).

Is there some non-thread-safe code that would remain after applying all 
six patches in the series?

Thanks,
Zev

Information forwarded to bug-grep <at> gnu.org:
bug#24260; Package grep. (Fri, 19 Aug 2016 21:55:01 GMT) Full text and rfc822 format available.

Message #20 received at 24260 <at> debbugs.gnu.org (full text, mbox):

From: Zev Weiss <zev <at> bewilderbeest.net>
To: Jim Meyering <jim <at> meyering.net>
Cc: 24260 <at> debbugs.gnu.org
Subject: Re: bug#24260: [PATCH 1/6] dfa: thread-safety: remove 'dfa' global
 in dfa.c
Date: Fri, 19 Aug 2016 16:54:37 -0500

On Fri, Aug 19, 2016 at 09:37:54AM -0700, Jim Meyering wrote:
>On Thu, Aug 18, 2016 at 7:46 AM, Jim Meyering <jim <at> meyering.net> wrote:
>> On Thu, Aug 18, 2016 at 3:50 AM, Zev Weiss <zev <at> bewilderbeest.net> wrote:
>>> * src/dfa.c: remove global dfa struct.  A pointer to a struct dfa is
>>> instead added as a parameter to the functions that had been using the
>>> global.
>>
>> Thank you.
>> At first glance, I like it, but saw no "const" attribute on any added
>> parameter declaration.
>> Surely some of those can be "const".  Would you make all new dfa
>> parameters const that can be?
>
>Hi Zev,
>
>I went ahead and did that for you.
>There were only two: using_simple_locale and nsubtoks.
>I amended your patch to make those "dfa" parameters "const"
>and adjusted the commit message to conform to our standards:
>
>  - use active voice, not passive
>  - enumerate each affected variable/function
>
>Yes, it is tedious, but for a project as mature as grep, it is
>essential. You would not believe the number of times that having the
>ability to cross-check the prose of a ChangeLog entry with the actual
>change has led to pre-commit improvements.
>
>Now, since I've changed a commit in your name that I expect to push,
>it is your job to proof-read it to make sure you're still happy to
>have your name on this modified commit. We treat our git repository as
>write-once. No non-fast-forward pushes on master, so once pushed, it's
>there forever. Let me know.

Hi Jim,

The amended version of the patch looks fine to me, thanks for checking.

Zev

Information forwarded to bug-grep <at> gnu.org:
bug#24260; Package grep. (Fri, 19 Aug 2016 22:26:02 GMT) Full text and rfc822 format available.

Message #23 received at 24260 <at> debbugs.gnu.org (full text, mbox):

From: Norihiro Tanaka <noritnk <at> kcn.ne.jp>
To: Zev Weiss <zev <at> bewilderbeest.net>
Cc: 24260 <at> debbugs.gnu.org
Subject: Re: bug#24260: [PATCH 1/6] dfa: thread-safety: remove 'dfa' global in
 dfa.c
Date: Sat, 20 Aug 2016 07:25:06 +0900

On Fri, 19 Aug 2016 16:46:16 -0500
Zev Weiss <zev <at> bewilderbeest.net> wrote:

> I'm not sure I understand -- the first patch in my series just removes the global dfa variable and instead passes it as a parameter.  This alone doesn't make the whole thing thread-safe, it's just a first step along the way.  With the whole series applied though I believe everything should be thread-safe (for example, lexptr is moved into a sub-struct of struct dfa in the second patch in the series).
> 
> Is there some non-thread-safe code that would remain after applying all six patches in the series?

Hi Zev,

Thanks for replying.  I say a reverse thing.

I believe that there is no problem if only dfaexec() is thread safe.  In
other words, I think that variables that we must really move to support
multipthread are eolbyte, sbit and letters, newline only which depend
on eolbyte, and mbrtowc_cache[] and never_trail should be changed as the
value does not changed by other thread at running dfaexec() by marking
already set, and we can leave other static and global as syntax etc.
It will bring us to the saving of memory albeit only slightly.

Thanks,
Norihiro

Information forwarded to bug-grep <at> gnu.org:
bug#24260; Package grep. (Fri, 19 Aug 2016 22:35:02 GMT) Full text and rfc822 format available.

Message #26 received at 24260 <at> debbugs.gnu.org (full text, mbox):

From: Norihiro Tanaka <noritnk <at> kcn.ne.jp>
To: Zev Weiss <zev <at> bewilderbeest.net>
Cc: 24260 <at> debbugs.gnu.org
Subject: Re: bug#24260: [PATCH 1/6] dfa: thread-safety: remove 'dfa' global in
 dfa.c
Date: Sat, 20 Aug 2016 07:34:20 +0900

On Sat, 20 Aug 2016 07:25:06 +0900
Norihiro Tanaka <noritnk <at> kcn.ne.jp> wrote:
> Hi Zev,
> 
> Thanks for replying.  I say a reverse thing.
> 
> I believe that there is no problem if only dfaexec() is thread safe.  In
> other words, I think that variables that we must really move to support
> multipthread are eolbyte, sbit and letters, newline only which depend
> on eolbyte, and mbrtowc_cache[] and never_trail should be changed as the
> value does not changed by other thread at running dfaexec() by marking
> already set, and we can leave other static and global as syntax etc.
> It will bring us to the saving of memory albeit only slightly.

Ah, I replied for first patch, but my all comments are for second or
later patches.  I agree for first patch entirely regardless of whether
we support multithread or not.

Reply sent to Jim Meyering <jim <at> meyering.net>:
You have taken responsibility. (Fri, 19 Aug 2016 22:54:02 GMT) Full text and rfc822 format available.

Notification sent to Zev Weiss <zev <at> bewilderbeest.net>:
bug acknowledged by developer. (Fri, 19 Aug 2016 22:54:02 GMT) Full text and rfc822 format available.

Message #31 received at 24260-done <at> debbugs.gnu.org (full text, mbox):

From: Jim Meyering <jim <at> meyering.net>
To: Zev Weiss <zev <at> bewilderbeest.net>
Cc: 24260-done <at> debbugs.gnu.org
Subject: Re: bug#24260: [PATCH 1/6] dfa: thread-safety: remove 'dfa' global in
 dfa.c
Date: Fri, 19 Aug 2016 15:53:12 -0700

On Fri, Aug 19, 2016 at 2:54 PM, Zev Weiss <zev <at> bewilderbeest.net> wrote:
...
> The amended version of the patch looks fine to me, thanks for checking.

Pushed.

Information forwarded to bug-grep <at> gnu.org:
bug#24260; Package grep. (Fri, 19 Aug 2016 23:04:01 GMT) Full text and rfc822 format available.

Message #34 received at 24260 <at> debbugs.gnu.org (full text, mbox):

From: Zev Weiss <zev <at> bewilderbeest.net>
To: Norihiro Tanaka <noritnk <at> kcn.ne.jp>
Cc: 24260 <at> debbugs.gnu.org
Subject: Re: bug#24260: [PATCH 1/6] dfa: thread-safety: remove 'dfa' global
 in dfa.c
Date: Fri, 19 Aug 2016 18:03:19 -0500

On Sat, Aug 20, 2016 at 07:34:20AM +0900, Norihiro Tanaka wrote:
>
>On Sat, 20 Aug 2016 07:25:06 +0900
>Norihiro Tanaka <noritnk <at> kcn.ne.jp> wrote:
>> Hi Zev,
>>
>> Thanks for replying.  I say a reverse thing.
>>
>> I believe that there is no problem if only dfaexec() is thread safe.  In
>> other words, I think that variables that we must really move to support
>> multipthread are eolbyte, sbit and letters, newline only which depend
>> on eolbyte, and mbrtowc_cache[] and never_trail should be changed as the
>> value does not changed by other thread at running dfaexec() by marking
>> already set, and we can leave other static and global as syntax etc.
>> It will bring us to the saving of memory albeit only slightly.
>
>Ah, I replied for first patch, but my all comments are for second or
>later patches.  I agree for first patch entirely regardless of whether
>we support multithread or not.
>

Okay -- so your question is about the necessity of making operations 
other than dfaexec() thread-safe?  That's reasonable, though (obviously) 
I went ahead made the other operations thread-safe anyway.

1) It was, in some ways, simpler for me.  Rather than tracing through 
exactly when and how each variable was used to determine which could be 
safely left as globals, it was somewhat easier to just migrate all 
mutable global state into a context struct of some sort.

2) While it's not a terribly elegant approach (and certainly doesn't 
have to be this way), my current multithreading patch set actually does 
compile the regex separately (and concurrently) in each thread (for 
reasons similar to #1 above -- it was just simpler to do it that way).

3) Finally, and perhaps most compellingly in my opinion, especially 
since dfa.c is soon going to become a library component, it seems 
desirable for compilation etc. to be made thread safe anyway.  While 
grep doesn't really *need* to have it be thread-safe (and I'm guessing 
gawk and sed currently don't either), it doesn't seem all that 
unreasonable for an application to *want* to concurrently compile 
independent regexes, so it would seem a bit unfortunate to me for a 
library (gnulib) to not support that at all.

Zev

Information forwarded to bug-grep <at> gnu.org:
bug#24260; Package grep. (Sat, 20 Aug 2016 23:42:02 GMT) Full text and rfc822 format available.

Message #37 received at 24260 <at> debbugs.gnu.org (full text, mbox):

From: Norihiro Tanaka <noritnk <at> kcn.ne.jp>
To: Zev Weiss <zev <at> bewilderbeest.net>
Cc: 24260 <at> debbugs.gnu.org
Subject: Re: bug#24260: [PATCH 1/6] dfa: thread-safety: remove 'dfa' global in
 dfa.c
Date: Sun, 21 Aug 2016 08:40:59 +0900

On Fri, 19 Aug 2016 18:03:19 -0500
Zev Weiss <zev <at> bewilderbeest.net> wrote:

> Okay -- so your question is about the necessity of making operations other than dfaexec() thread-safe?  That's reasonable, though (obviously) I went ahead made the other operations thread-safe anyway.
> 
> 1) It was, in some ways, simpler for me.  Rather than tracing through exactly when and how each variable was used to determine which could be safely left as globals, it was somewhat easier to just migrate all mutable global state into a context struct of some sort.
> 
> 2) While it's not a terribly elegant approach (and certainly doesn't have to be this way), my current multithreading patch set actually does compile the regex separately (and concurrently) in each thread (for reasons similar to #1 above -- it was just simpler to do it that way).
> 
> 3) Finally, and perhaps most compellingly in my opinion, especially since dfa.c is soon going to become a library component, it seems desirable for compilation etc. to be made thread safe anyway.  While grep doesn't really *need* to have it be thread-safe (and I'm guessing gawk and sed currently don't either), it doesn't seem all that unreasonable for an application to *want* to concurrently compile independent regexes, so it would seem a bit unfortunate to me for a library (gnulib) to not support that at all.
> 
> 
> Zev

Thanks.  dfa exists only to improve performance of regex and regex uses
static variable for syntax which is re_syntax_options.  So we can not
still change compile phase thread-safe.

In addition, IMHO, I think that compile should run only one time even
if grep runs multithread as you say in 2, and I do not expect that
members used in only compile not used in execute is generated each more
than one.

BTW, mbcsets, nmbcsets and mbcsets_alloc use in only compile, so I think
that they may be also put out of struct dfa in future. (That isn't always
to move them to static or global)

Norihiro

Information forwarded to bug-grep <at> gnu.org:
bug#24260; Package grep. (Sun, 21 Aug 2016 01:22:02 GMT) Full text and rfc822 format available.

Message #40 received at 24260 <at> debbugs.gnu.org (full text, mbox):

From: Zev Weiss <zev <at> bewilderbeest.net>
To: Norihiro Tanaka <noritnk <at> kcn.ne.jp>
Cc: 24260 <at> debbugs.gnu.org
Subject: Re: bug#24260: [PATCH 1/6] dfa: thread-safety: remove 'dfa' global
 in dfa.c
Date: Sat, 20 Aug 2016 20:21:33 -0500

On Sun, Aug 21, 2016 at 08:40:59AM +0900, Norihiro Tanaka wrote:
>
>On Fri, 19 Aug 2016 18:03:19 -0500
>Zev Weiss <zev <at> bewilderbeest.net> wrote:
>
>> Okay -- so your question is about the necessity of making operations other than dfaexec() thread-safe?  That's reasonable, though (obviously) I went ahead made the other operations thread-safe anyway.
>>
>> 1) It was, in some ways, simpler for me.  Rather than tracing through exactly when and how each variable was used to determine which could be safely left as globals, it was somewhat easier to just migrate all mutable global state into a context struct of some sort.
>>
>> 2) While it's not a terribly elegant approach (and certainly doesn't have to be this way), my current multithreading patch set actually does compile the regex separately (and concurrently) in each thread (for reasons similar to #1 above -- it was just simpler to do it that way).
>>
>> 3) Finally, and perhaps most compellingly in my opinion, especially since dfa.c is soon going to become a library component, it seems desirable for compilation etc. to be made thread safe anyway.  While grep doesn't really *need* to have it be thread-safe (and I'm guessing gawk and sed currently don't either), it doesn't seem all that unreasonable for an application to *want* to concurrently compile independent regexes, so it would seem a bit unfortunate to me for a library (gnulib) to not support that at all.
>>
>>
>> Zev
>
>Thanks.  dfa exists only to improve performance of regex and regex uses
>static variable for syntax which is re_syntax_options.  So we can not
>still change compile phase thread-safe.

Hmm, I hadn't noticed that previously -- thanks.

>
>In addition, IMHO, I think that compile should run only one time even
>if grep runs multithread as you say in 2, and I do not expect that
>members used in only compile not used in execute is generated each more
>than one.
>

Yes, certainly it would be preferable for a multithreaded grep to 
compile the regex once regardless of the thread count.  I've been 
meaning to implement it that way in my patch set, but haven't quite 
gotten around to it yet...hopefully soon.


Zev

bug archived. Request was from Debbugs Internal Request <help-debbugs <at> gnu.org> to internal_control <at> debbugs.gnu.org. (Sun, 18 Sep 2016 11:24:03 GMT) Full text and rfc822 format available.

This bug report was last modified 7 years and 234 days ago.

Previous Next

GNU bug tracking system
Copyright (C) 1999 Darren O. Benham, 1997,2003 nCipher Corporation Ltd, 1994-97 Ian Jackson.

GNU bug report logs - #24260 [PATCH 1/6] dfa: thread-safety: remove 'dfa' global in dfa.c

GNU bug report logs - #24260
[PATCH 1/6] dfa: thread-safety: remove 'dfa' global in dfa.c