From e4aaa8a9947d951eb1e69979c3c58e197adbe40f Mon Sep 17 00:00:00 2001
From: Jelle Zijlstra <jelle.zijlstra@gmail.com>
Date: Thu, 25 Apr 2024 16:45:46 -0700
Subject: [PATCH] Fix incorrect f-string tokenization (#4332)

---
 CHANGES.md                     |   2 +
 src/blib2to3/pgen2/tokenize.py | 163 ++++++++++++++++++++++++---------
 tests/data/cases/pep_701.py    |   8 ++
 3 files changed, 130 insertions(+), 43 deletions(-)

diff --git a/CHANGES.md b/CHANGES.md
index 79e7b0b1444..17decf5fe05 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -26,6 +26,8 @@
 
 <!-- Changes to the parser or to version autodetection -->
 
+- Fix regression where certain complex f-strings failed to parse (#4332)
+
 ### Performance
 
 <!-- Changes that improve Black's performance. -->
diff --git a/src/blib2to3/pgen2/tokenize.py b/src/blib2to3/pgen2/tokenize.py
index fd0b5564f43..f66087bd0e3 100644
--- a/src/blib2to3/pgen2/tokenize.py
+++ b/src/blib2to3/pgen2/tokenize.py
@@ -480,6 +480,88 @@ def _split_fstring_start_and_middle(token: str) -> Tuple[str, str]:
     raise ValueError(f"Token {token!r} is not a valid f-string start")
 
 
+STATE_NOT_FSTRING: Final = 0  # not in an f-string (initial FStringState entry)
+STATE_MIDDLE: Final = 1  # in the string portion of an f-string (outside braces)
+STATE_IN_BRACES: Final = 2  # between braces in an f-string, before any colon
+# in the format specifier (between the colon and the closing brace)
+STATE_IN_COLON: Final = 3
+
+
+class FStringState:
+    """Keeps track of state around f-strings.
+
+    The tokenizer should call the appropriate method on this class when
+    it transitions to a different part of an f-string. This is needed
+    because the tokenization depends on knowing where exactly we are in
+    the f-string.
+
+    For example, consider the following f-string:
+
+        f"a{1:b{2}c}d"
+
+    The following is the tokenization of this string and the states
+    tracked by this class:
+
+        1,0-1,2:	FSTRING_START	'f"'  # [STATE_NOT_FSTRING, STATE_MIDDLE]
+        1,2-1,3:	FSTRING_MIDDLE	'a'
+        1,3-1,4:	LBRACE	'{'  # [STATE_NOT_FSTRING, STATE_IN_BRACES]
+        1,4-1,5:	NUMBER	'1'
+        1,5-1,6:	OP	':'  # [STATE_NOT_FSTRING, STATE_IN_COLON]
+        1,6-1,7:	FSTRING_MIDDLE	'b'
+        1,7-1,8:	LBRACE	'{'  # [STATE_NOT_FSTRING, STATE_IN_COLON, STATE_IN_BRACES]
+        1,8-1,9:	NUMBER	'2'
+        1,9-1,10:	RBRACE	'}'  # [STATE_NOT_FSTRING, STATE_IN_COLON]
+        1,10-1,11:	FSTRING_MIDDLE	'c'
+        1,11-1,12:	RBRACE	'}'  # [STATE_NOT_FSTRING, STATE_MIDDLE]
+        1,12-1,13:	FSTRING_MIDDLE	'd'
+        1,13-1,14:	FSTRING_END	'"'  # [STATE_NOT_FSTRING]
+        1,14-1,15:	NEWLINE	'\n'
+        2,0-2,0:	ENDMARKER	''
+
+    Notice that the nested braces in the format specifier are represented
+    by adding a STATE_IN_BRACES entry to the state stack. The stack is
+    also used if there are nested f-strings.
+
+    """
+
+    def __init__(self) -> None:
+        self.stack: List[int] = [STATE_NOT_FSTRING]  # base state: not in an f-string
+
+    def is_in_fstring_expression(self) -> bool:  # True inside {...}, incl. format spec
+        return self.stack[-1] not in (STATE_MIDDLE, STATE_NOT_FSTRING)
+
+    def current(self) -> int:  # the state on top of the stack
+        return self.stack[-1]
+
+    def enter_fstring(self) -> None:  # push a new f-string's STATE_MIDDLE
+        self.stack.append(STATE_MIDDLE)
+
+    def leave_fstring(self) -> None:  # pop the entry pushed by enter_fstring()
+        state = self.stack.pop()
+        assert state == STATE_MIDDLE  # f-strings end from their string portion
+
+    def consume_lbrace(self) -> None:  # update the stack for an f-string "{"
+        current_state = self.stack[-1]
+        if current_state == STATE_MIDDLE:  # a replacement field opens
+            self.stack[-1] = STATE_IN_BRACES  # replaced in place, not pushed
+        elif current_state == STATE_IN_COLON:  # nested field in a format spec
+            self.stack.append(STATE_IN_BRACES)  # keep IN_COLON below to return to
+        else:
+            assert False, current_state  # "{" is only valid in the two states above
+
+    def consume_rbrace(self) -> None:  # update the stack for an f-string "}"
+        current_state = self.stack[-1]
+        assert current_state in (STATE_IN_BRACES, STATE_IN_COLON)
+        if len(self.stack) > 1 and self.stack[-2] == STATE_IN_COLON:
+            self.stack.pop()  # closes a field nested in a format spec
+        else:
+            self.stack[-1] = STATE_MIDDLE  # back to the string portion
+
+    def consume_colon(self) -> None:  # the ":" that starts a format specifier
+        assert self.stack[-1] == STATE_IN_BRACES, self.stack
+        self.stack[-1] = STATE_IN_COLON  # replaced in place, not pushed
+
+
 def generate_tokens(
     readline: Callable[[], str], grammar: Optional[Grammar] = None
 ) -> Iterator[GoodTokenInfo]:
@@ -498,12 +580,10 @@ def generate_tokens(
     and the line on which the token was found. The line passed is the
     logical line; continuation lines are included.
     """
-    lnum = parenlev = fstring_level = continued = 0
+    lnum = parenlev = continued = 0
     parenlev_stack: List[int] = []
-    inside_fstring_braces = False
-    inside_fstring_colon = False
+    fstring_state = FStringState()
     formatspec = ""
-    bracelev = 0
     numchars: Final[str] = "0123456789"
     contstr, needcont = "", 0
     contline: Optional[str] = None
@@ -542,13 +622,15 @@ def generate_tokens(
                 spos = strstart
                 epos = (lnum, end)
                 tokenline = contline + line
-                if fstring_level == 0 and not is_fstring_start(token):
+                if (
+                    fstring_state.current() == STATE_NOT_FSTRING
+                    and not is_fstring_start(token)
+                ):
                     yield (STRING, token, spos, epos, tokenline)
                     endprog_stack.pop()
                     parenlev = parenlev_stack.pop()
                 else:
                     if is_fstring_start(token):
-                        fstring_level += 1
                         fstring_start, token = _split_fstring_start_and_middle(token)
                         fstring_start_epos = (lnum, spos[1] + len(fstring_start))
                         yield (
@@ -558,6 +640,7 @@ def generate_tokens(
                             fstring_start_epos,
                             tokenline,
                         )
+                        fstring_state.enter_fstring()
                         # increase spos to the end of the fstring start
                         spos = fstring_start_epos
 
@@ -572,7 +655,7 @@ def generate_tokens(
                             line,
                         )
                         yield (LBRACE, lbrace, lbrace_spos, epos, line)
-                        inside_fstring_braces = True
+                        fstring_state.consume_lbrace()
                     else:
                         if token.endswith(('"""', "'''")):
                             fstring_middle, fstring_end = token[:-3], token[-3:]
@@ -594,11 +677,9 @@ def generate_tokens(
                             epos,
                             line,
                         )
-                        fstring_level -= 1
+                        fstring_state.leave_fstring()
                         endprog_stack.pop()
                         parenlev = parenlev_stack.pop()
-                        if fstring_level > 0:
-                            inside_fstring_braces = True
                 pos = end
                 contstr, needcont = "", 0
                 contline = None
@@ -619,7 +700,11 @@ def generate_tokens(
                 continue
 
         # new statement
-        elif parenlev == 0 and not continued and not inside_fstring_braces:
+        elif (
+            parenlev == 0
+            and not continued
+            and not fstring_state.is_in_fstring_expression()
+        ):
             if not line:
                 break
             column = 0
@@ -687,7 +772,7 @@ def generate_tokens(
             continued = 0
 
         while pos < max:
-            if fstring_level > 0 and not inside_fstring_braces:
+            if fstring_state.current() == STATE_MIDDLE:
                 endprog = endprog_stack[-1]
                 endmatch = endprog.match(line, pos)
                 if endmatch:  # all on one line
@@ -718,14 +803,12 @@ def generate_tokens(
                             (lnum, end),
                             line,
                         )
-                        fstring_level -= 1
+                        fstring_state.leave_fstring()
                         endprog_stack.pop()
                         parenlev = parenlev_stack.pop()
-                        if fstring_level > 0:
-                            inside_fstring_braces = True
                     else:
                         yield (LBRACE, "{", (lnum, end - 1), (lnum, end), line)
-                        inside_fstring_braces = True
+                        fstring_state.consume_lbrace()
                     pos = end
                     continue
                 else:  # multiple lines
@@ -734,7 +817,7 @@ def generate_tokens(
                     contline = line
                     break
 
-            if inside_fstring_colon:
+            if fstring_state.current() == STATE_IN_COLON:
                 match = fstring_middle_after_colon.match(line, pos)
                 if match is None:
                     formatspec += line[pos:]
@@ -754,15 +837,19 @@ def generate_tokens(
                 formatspec = ""
 
                 if brace_or_nl == "{":
-                    yield (OP, "{", (lnum, brace_start), (lnum, brace_end), line)
-                    bracelev += 1
+                    yield (LBRACE, "{", (lnum, brace_start), (lnum, brace_end), line)
+                    fstring_state.consume_lbrace()
+                    end = brace_end
+                elif brace_or_nl == "}":
+                    yield (RBRACE, "}", (lnum, brace_start), (lnum, brace_end), line)
+                    fstring_state.consume_rbrace()
                     end = brace_end
+                    formatspec_start = (lnum, brace_end)
 
-                inside_fstring_colon = False
                 pos = end
                 continue
 
-            if inside_fstring_braces and parenlev == 0:
+            if fstring_state.current() == STATE_IN_BRACES and parenlev == 0:
                 match = bang.match(line, pos)
                 if match:
                     start, end = match.span(1)
@@ -774,7 +861,7 @@ def generate_tokens(
                 if match:
                     start, end = match.span(1)
                     yield (OP, ":", (lnum, start), (lnum, end), line)
-                    inside_fstring_colon = True
+                    fstring_state.consume_colon()
                     formatspec_start = (lnum, end)
                     pos = end
                     continue
@@ -791,7 +878,7 @@ def generate_tokens(
                     yield (NUMBER, token, spos, epos, line)
                 elif initial in "\r\n":
                     newline = NEWLINE
-                    if parenlev > 0 or inside_fstring_braces:
+                    if parenlev > 0 or fstring_state.is_in_fstring_expression():
                         newline = NL
                     elif async_def:
                         async_def_nl = True
@@ -813,7 +900,7 @@ def generate_tokens(
                     parenlev = 0
                     if is_fstring_start(token):
                         yield (FSTRING_START, token, spos, epos, line)
-                        fstring_level += 1
+                        fstring_state.enter_fstring()
 
                     endmatch = endprog.match(line, pos)
                     if endmatch:  # all on one line
@@ -848,11 +935,9 @@ def generate_tokens(
                                     epos,
                                     line,
                                 )
-                                fstring_level -= 1
+                                fstring_state.leave_fstring()
                                 endprog_stack.pop()
                                 parenlev = parenlev_stack.pop()
-                                if fstring_level > 0:
-                                    inside_fstring_braces = True
                             else:
                                 fstring_middle, lbrace = token[:-1], token[-1]
                                 fstring_middle_epos = lbrace_spos = (lnum, end - 1)
@@ -864,7 +949,7 @@ def generate_tokens(
                                     line,
                                 )
                                 yield (LBRACE, lbrace, lbrace_spos, epos, line)
-                                inside_fstring_braces = True
+                                fstring_state.consume_lbrace()
                             pos = end
                     else:
                         # multiple lines
@@ -919,7 +1004,7 @@ def generate_tokens(
 
                             start_epos = (lnum, start + offset)
                             yield (FSTRING_START, fstring_start, spos, start_epos, line)
-                            fstring_level += 1
+                            fstring_state.enter_fstring()
                             endprog = endprogs[fstring_start]
                             endprog_stack.append(endprog)
                             parenlev_stack.append(parenlev)
@@ -940,16 +1025,14 @@ def generate_tokens(
                                 end_spos = (lnum, end_offset)
                                 end_epos = (lnum, end_offset + 1)
                                 yield (FSTRING_END, token[-1], end_spos, end_epos, line)
-                                fstring_level -= 1
+                                fstring_state.leave_fstring()
                                 endprog_stack.pop()
                                 parenlev = parenlev_stack.pop()
-                                if fstring_level > 0:
-                                    inside_fstring_braces = True
                             else:
                                 end_spos = (lnum, end_offset)
                                 end_epos = (lnum, end_offset + 1)
                                 yield (LBRACE, "{", end_spos, end_epos, line)
-                                inside_fstring_braces = True
+                                fstring_state.consume_lbrace()
 
                 elif initial.isidentifier():  # ordinary name
                     if token in ("async", "await"):
@@ -998,19 +1081,13 @@ def generate_tokens(
                 elif (
                     initial == "}"
                     and parenlev == 0
-                    and bracelev == 0
-                    and fstring_level > 0
+                    and fstring_state.is_in_fstring_expression()
                 ):
                     yield (RBRACE, token, spos, epos, line)
-                    inside_fstring_braces = False
+                    fstring_state.consume_rbrace()
+                    formatspec_start = epos
                 else:
-                    if parenlev == 0 and bracelev > 0 and initial == "}":
-                        bracelev -= 1
-                        # if we're still inside fstrings, we're still part of the format spec
-                        if inside_fstring_braces:
-                            inside_fstring_colon = True
-                            formatspec_start = (lnum, pos)
-                    elif initial in "([{":
+                    if initial in "([{":
                         parenlev += 1
                     elif initial in ")]}":
                         parenlev -= 1
diff --git a/tests/data/cases/pep_701.py b/tests/data/cases/pep_701.py
index a0b67413ee9..9e157bd6fd1 100644
--- a/tests/data/cases/pep_701.py
+++ b/tests/data/cases/pep_701.py
@@ -119,6 +119,10 @@
     level=0,
 )
 
+f"{1:{f'{2}'}}"
+f'{1:{f'{2}'}}'
+f'{1:{2}d}'
+
 f'{{\\"kind\\":\\"ConfigMap\\",\\"metadata\\":{{\\"annotations\\":{{}},\\"name\\":\\"cluster-info\\",\\"namespace\\":\\"amazon-cloudwatch\\"}}}}'
 
 # output
@@ -243,4 +247,8 @@
     level=0,
 )
 
+f"{1:{f'{2}'}}"
+f"{1:{f'{2}'}}"
+f"{1:{2}d}"
+
 f'{{\\"kind\\":\\"ConfigMap\\",\\"metadata\\":{{\\"annotations\\":{{}},\\"name\\":\\"cluster-info\\",\\"namespace\\":\\"amazon-cloudwatch\\"}}}}'