    1 {-# LANGUAGE DeriveAnyClass #-}
    2 {-# LANGUAGE DeriveGeneric #-}
    3 {-# LANGUAGE OverloadedStrings #-}
    4 
    5 -- |
    6 -- Module      : Aihc.Parser.Lex
    7 -- Description : Lex Haskell source into span-annotated tokens with inline extension handling
    8 --
    9 -- This module performs the pre-parse tokenization step for Haskell source code.
   10 -- It turns raw text into 'LexToken's that preserve:
   11 --
   12 -- * a semantic token classification ('LexTokenKind')
   13 -- * the original token text ('lexTokenText')
   14 -- * source location information ('lexTokenSpan')
   15 --
   16 -- The lexer runs in two phases:
   17 --
   18 -- 1. /Raw tokenization/ with a custom incremental scanner that consumes one or more
   19 --    input chunks and emits tokens lazily. Extension-specific lexing (such as
   20 --    @NegativeLiterals@ and @LexicalNegation@) is handled inline during this phase
   21 --    by tracking the previous token context.
   22 -- 2. /Layout insertion/ ('applyLayoutTokens') that inserts virtual @{@, @;@ and @}@
   23 --    according to indentation (the offside rule), so the parser can treat implicit
   24 --    layout like explicit braces and semicolons.
   25 --
   26 -- Scanning is incremental and error-tolerant:
   27 --
   28 -- * token production starts as soon as enough input is available
   29 -- * malformed lexemes produce 'TkError' tokens instead of aborting lexing
   30 -- * @# ...@, @#line ...@, @{-# LINE #-}@ and @{-# COLUMN #-}@ are handled in-band by
   31 --   the lexer and update subsequent token spans without being exposed as normal tokens
   32 --
   33 -- Layout-sensitive syntax is the tricky part. The implementation tracks a stack of
   34 -- layout contexts and mirrors the @haskell-src-exts@ model summarized in
   35 -- @docs/hse-indentation-layout.md@:
   36 --
   37 -- * after layout-introducing keywords (currently @do@, @of@, @let@, @where@, @\\case@, plus optional module
   38 --   body layout), mark a pending implicit block
   39 -- * if the next token is an explicit @{@, disable implicit insertion for that block
   40 -- * otherwise, open an implicit layout context at the next token column
   41 -- * at beginning-of-line tokens, dedent emits virtual @}@, equal-indent emits virtual
   42 --   @;@ (with a small suppression rule for @then@/@else@)
   43 --
   44 -- Keyword classification is intentionally lexical and exact. 'lexIdentifier'
   45 -- produces a keyword token /only/ when the full identifier text exactly matches a
   46 -- reserved word in 'keywordTokenKind'. That means:
   47 --
   48 -- * @where@ becomes 'TkKeywordWhere'
   49 -- * @where'@, @_where@, and @M.where@ remain identifiers
   50 --
   51 -- In other words, use keyword tokens only for exact reserved lexemes; contextual
   52 -- validity is left to the parser.
   53 module Aihc.Parser.Lex
   54   ( LexToken (..),
   55     LexTokenKind (..),
   56     isReservedIdentifier,
   57     readModuleHeaderExtensions,
   58     readModuleHeaderExtensionsFromChunks,
   59     lexTokensFromChunks,
   60     lexModuleTokensFromChunks,
   61     lexTokensWithExtensions,
   62     lexModuleTokensWithExtensions,
   63     lexTokens,
   64     lexModuleTokens,
   65   )
   66 where
   67 
   68 import Aihc.Parser.Syntax
   69 import Control.DeepSeq (NFData)
   70 import Data.Char (digitToInt, isAlphaNum, isAsciiLower, isAsciiUpper, isDigit, isHexDigit, isOctDigit, isSpace)
   71 import Data.List qualified as List
   72 import Data.Maybe (fromMaybe, isJust, mapMaybe)
   73 import Data.Text (Text)
   74 import Data.Text qualified as T
   75 import GHC.Generics (Generic)
   76 import Numeric (readHex, readInt, readOct)
   77 
-- | Semantic classification of a lexed token.
--
-- Constructors that carry 'Text' preserve the original lexeme (or its qualified
-- pieces); literal constructors carry the decoded value.
data LexTokenKind
  = -- Keywords (reserved identifiers per Haskell Report Section 2.4)
    TkKeywordCase
  | TkKeywordClass
  | TkKeywordData
  | TkKeywordDefault
  | TkKeywordDeriving
  | TkKeywordDo
  | TkKeywordElse
  | TkKeywordForeign
  | TkKeywordIf
  | TkKeywordImport
  | TkKeywordIn
  | TkKeywordInfix
  | TkKeywordInfixl
  | TkKeywordInfixr
  | TkKeywordInstance
  | TkKeywordLet
  | TkKeywordModule
  | TkKeywordNewtype
  | TkKeywordOf
  | TkKeywordThen
  | TkKeywordType
  | TkKeywordWhere
  | TkKeywordUnderscore -- _ (wildcard, reserved per Report)
  | -- Context-sensitive keywords (not strictly reserved per Report, but needed for imports)
    TkKeywordQualified
  | TkKeywordAs
  | TkKeywordHiding
  | -- Reserved operators (per Haskell Report Section 2.4)
    TkReservedDotDot -- ..
  | TkReservedColon -- :
  | TkReservedDoubleColon -- ::
  | TkReservedEquals -- =
  | TkReservedBackslash -- \
  | TkReservedPipe -- vertical bar
  | TkReservedLeftArrow -- <-
  | TkReservedRightArrow -- ->
  | TkReservedAt -- @
  | -- Note: ~ is NOT reserved; it uses whitespace-sensitive lexing (GHC proposal 0229)
    TkReservedDoubleArrow -- =>
  | -- Identifiers (per Haskell Report Section 2.4)
    TkVarId Text -- variable identifier (starts lowercase/_)
  | TkConId Text -- constructor identifier (starts uppercase)
  | TkQVarId Text -- qualified variable identifier
  | TkQConId Text -- qualified constructor identifier
  | -- Operators (per Haskell Report Section 2.4)
    TkVarSym Text -- variable symbol (doesn't start with :)
  | TkConSym Text -- constructor symbol (starts with :)
  | TkQVarSym Text -- qualified variable symbol
  | TkQConSym Text -- qualified constructor symbol
  | -- Literals
    TkInteger Integer
  | TkIntegerBase Integer Text -- value plus original base-prefixed spelling
  | TkFloat Double Text -- value plus original spelling
  | TkChar Char
  | TkString Text
  | -- Special characters (per Haskell Report Section 2.2)
    TkSpecialLParen -- (
  | TkSpecialRParen -- )
  | TkSpecialComma -- ,
  | TkSpecialSemicolon -- ;
  | TkSpecialLBracket -- [
  | TkSpecialRBracket -- ]
  | TkSpecialBacktick -- `
  | TkSpecialLBrace -- {
  | TkSpecialRBrace -- }
  | -- LexicalNegation support
    TkMinusOperator -- minus operator when LexicalNegation enabled (before prefix detection)
  | TkPrefixMinus -- prefix minus (tight, no space) for LexicalNegation
  | -- Whitespace-sensitive operator support (GHC proposal 0229)
    TkPrefixBang -- prefix bang (!x) for bang patterns
  | TkPrefixTilde -- prefix tilde (~x) for irrefutable patterns
  | -- Pragmas
    TkPragmaLanguage [ExtensionSetting]
  | TkPragmaWarning Text
  | TkPragmaDeprecated Text
  | -- Other
    TkQuasiQuote Text Text -- quoter name and quoted body
  | TkError Text -- malformed lexeme; lexing continues past it
  deriving (Eq, Ord, Show, Read, Generic, NFData)
  159 
-- | A single lexed token: its classification, the exact source text it was
-- lexed from, and its source span.
data LexToken = LexToken
  { -- | Semantic classification of the token.
    lexTokenKind :: !LexTokenKind,
    -- | Original token text as it appeared in the source.
    lexTokenText :: !Text,
    -- | Source location of the token.
    lexTokenSpan :: !SourceSpan
  }
  deriving (Eq, Ord, Show, Generic, NFData)
  166 
-- | Mutable-style state threaded through the raw scanner.
data LexerState = LexerState
  { -- | Remaining input characters.
    lexerInput :: String,
    -- | Current line (1-based; may be rewritten by line directives).
    lexerLine :: !Int,
    -- | Current column (1-based; may be rewritten by column pragmas).
    lexerCol :: !Int,
    -- | Whether the scanner is positioned at the start of a line.
    lexerAtLineStart :: !Bool,
    -- | Synthetic tokens queued for emission before scanning resumes
    -- (e.g. 'TkError' tokens produced while consuming trivia).
    lexerPending :: [LexToken],
    -- | Extensions that influence lexing (e.g. NegativeLiterals).
    lexerExtensions :: [Extension],
    -- | The kind of the previous non-trivia token (for NegativeLiterals/LexicalNegation)
    lexerPrevTokenKind :: !(Maybe LexTokenKind),
    -- | Whether trivia (whitespace/comments) was skipped since the last token
    lexerHadTrivia :: !Bool
  }
  deriving (Eq, Show)
  180 
-- | One entry of the layout context stack.
--
-- Implicit variants record the indent column at which the layout was opened;
-- dedenting to or past that column closes the context.
data LayoutContext
  = -- | Explicit @{@ was written by the user; no implicit insertion inside.
    LayoutExplicit
  | -- | Ordinary implicit layout (after @do@, @of@, @where@, ...).
    LayoutImplicit !Int
  | -- | Implicit layout opened by @let@; closed early by @in@ or top-level comma.
    LayoutImplicitLet !Int
  | -- | Implicit layout opened after 'then do' or 'else do'.
    -- This variant allows 'then' and 'else' to close it at the same indent level.
    LayoutImplicitAfterThenElse !Int
  | -- | Marker for ( or [ to scope implicit layout closures
    LayoutDelimiter
  deriving (Eq, Show)
  191 
-- | A layout block that has been announced by a keyword but not yet opened;
-- it is materialized at the next token (see 'openPendingLayout').
data PendingLayout
  = -- | Pending layout from @do@, @of@, @where@, @\\case@, or module body.
    PendingLayoutGeneric
  | -- | Pending layout from @let@ (closable by @in@).
    PendingLayoutLet
  | -- | Pending layout from 'do' after 'then' or 'else'.
    -- The resulting layout can be closed by 'then'/'else' at the same indent.
    PendingLayoutAfterThenElse
  deriving (Eq, Show)
  199 
-- | State machine for top-level module-body layout insertion.
data ModuleLayoutMode
  = -- | Module layout disabled (expression/declaration stream lexing).
    ModuleLayoutOff
  | -- | Looking for the first non-pragma token to decide how the module starts.
    ModuleLayoutSeekStart
  | -- | Saw @module@; waiting for the matching @where@ to open the body layout.
    ModuleLayoutAwaitWhere
  | -- | Module-top layout decision has been made; no further action.
    ModuleLayoutDone
  deriving (Eq, Show)
  206 
-- | State threaded through 'applyLayoutTokens'.
data LayoutState = LayoutState
  { -- | Stack of open layout contexts, innermost first.
    layoutContexts :: [LayoutContext],
    -- | Layout announced by the previous keyword, opened at the next token.
    layoutPendingLayout :: !(Maybe PendingLayout),
    -- | Start line of the previous token (for beginning-of-line detection).
    layoutPrevLine :: !(Maybe Int),
    -- | Kind of the previous token (for @then do@ / @\\case@ detection).
    layoutPrevTokenKind :: !(Maybe LexTokenKind),
    -- | Nesting depth of ( and [ (used to scope the comma rule for @let@).
    layoutDelimiterDepth :: !Int,
    -- | Module-top layout state machine.
    layoutModuleMode :: !ModuleLayoutMode
  }
  deriving (Eq, Show)
  216 
-- | Position override extracted from a line/column directive
-- (@# ...@, @{-# LINE #-}@, @{-# COLUMN #-}@); 'Nothing' leaves the
-- corresponding coordinate unchanged.
data DirectiveUpdate = DirectiveUpdate
  { directiveLine :: !(Maybe Int),
    directiveCol :: !(Maybe Int)
  }
  deriving (Eq, Show)
  222 
  223 -- | Convenience lexer entrypoint: no extensions, parse as expression/declaration stream.
  224 --
  225 -- This variant consumes a single strict 'Text' chunk and returns a lazy list of
  226 -- tokens. Lexing errors are preserved as 'TkError' tokens instead of causing
  227 -- lexing to fail.
  228 lexTokens :: Text -> [LexToken]
  229 lexTokens = lexTokensFromChunks . (: [])
  230 
  231 -- | Convenience lexer entrypoint for full modules: no explicit extension list.
  232 --
  233 -- Leading header pragmas are scanned first so module-enabled extensions can be
  234 -- applied before token rewrites and top-level layout insertion.
  235 lexModuleTokens :: Text -> [LexToken]
  236 lexModuleTokens input =
  237   lexModuleTokensFromChunks
  238     (enabledExtensionsFromSettings (readModuleHeaderExtensionsFromChunks [input]))
  239     [input]
  240 
  241 -- | Lex an expression/declaration stream from one or more input chunks.
  242 --
  243 -- Tokens are produced lazily, so downstream consumers can begin parsing before
  244 -- the full source has been scanned.
  245 lexTokensFromChunks :: [Text] -> [LexToken]
  246 lexTokensFromChunks = lexTokensFromChunksWithExtensions []
  247 
  248 -- | Lex a full module from one or more input chunks with explicit extensions.
  249 --
  250 -- This variant enables module-body layout insertion in addition to the normal
  251 -- token scan and extension rewrites.
  252 lexModuleTokensFromChunks :: [Extension] -> [Text] -> [LexToken]
  253 lexModuleTokensFromChunks = lexChunksWithExtensions True
  254 
  255 -- | Lex source text using explicit lexer extensions.
  256 --
  257 -- This runs raw tokenization, extension rewrites, and implicit-layout insertion.
  258 -- Module-top layout is /not/ enabled here. Malformed lexemes become 'TkError'
  259 -- tokens in the token stream.
  260 lexTokensWithExtensions :: [Extension] -> Text -> [LexToken]
  261 lexTokensWithExtensions exts input = lexTokensFromChunksWithExtensions exts [input]
  262 
  263 -- | Lex module source text using explicit lexer extensions.
  264 --
  265 -- Like 'lexTokensWithExtensions', but also enables top-level module-body layout:
  266 -- when the source omits explicit braces, virtual layout tokens are inserted
  267 -- after @module ... where@ (or from the first non-pragma token in module-less files).
  268 lexModuleTokensWithExtensions :: [Extension] -> Text -> [LexToken]
  269 lexModuleTokensWithExtensions exts input = lexModuleTokensFromChunks exts [input]
  270 
  271 -- | Internal chunked lexer entrypoint for non-module inputs.
  272 --
  273 -- This exists so callers can stream input through the same scanner while still
  274 -- selecting extension-driven token rewrites.
  275 lexTokensFromChunksWithExtensions :: [Extension] -> [Text] -> [LexToken]
  276 lexTokensFromChunksWithExtensions = lexChunksWithExtensions False
  277 
  278 -- | Run the full lexer pipeline over chunked input.
  279 --
  280 -- The scanner operates over the concatenated chunk stream with inline extension
  281 -- handling, then the resulting token stream is passed through the layout insertion step.
  282 lexChunksWithExtensions :: Bool -> [Extension] -> [Text] -> [LexToken]
  283 lexChunksWithExtensions enableModuleLayout exts chunks =
  284   applyLayoutTokens enableModuleLayout (scanTokens initialLexerState)
  285   where
  286     initialLexerState =
  287       LexerState
  288         { lexerInput = concatMap T.unpack chunks,
  289           lexerLine = 1,
  290           lexerCol = 1,
  291           lexerAtLineStart = True,
  292           lexerPending = [],
  293           lexerExtensions = exts,
  294           lexerPrevTokenKind = Nothing,
  295           lexerHadTrivia = True -- Start of file is treated as having leading trivia
  296         }
  297 
  298 -- | Read leading module-header pragmas and return parsed LANGUAGE settings.
  299 --
  300 -- This scans only the pragma/header prefix (allowing whitespace and comments)
  301 -- and stops at the first non-pragma token or lexer error token.
  302 readModuleHeaderExtensions :: Text -> [ExtensionSetting]
  303 readModuleHeaderExtensions input = readModuleHeaderExtensionsFromChunks [input]
  304 
  305 -- | Read leading module-header pragmas from one or more input chunks.
  306 --
  307 -- This scans only the pragma/header prefix (allowing whitespace and comments)
  308 -- and stops at the first non-pragma token or lexer error token.
  309 readModuleHeaderExtensionsFromChunks :: [Text] -> [ExtensionSetting]
  310 readModuleHeaderExtensionsFromChunks chunks = go (lexTokensFromChunks chunks)
  311   where
  312     go toks =
  313       case toks of
  314         LexToken {lexTokenKind = TkPragmaLanguage settings} : rest -> settings <> go rest
  315         LexToken {lexTokenKind = TkPragmaWarning _} : rest -> go rest
  316         LexToken {lexTokenKind = TkPragmaDeprecated _} : rest -> go rest
  317         LexToken {lexTokenKind = TkError _} : _ -> []
  318         _ -> []
  319 
  320 enabledExtensionsFromSettings :: [ExtensionSetting] -> [Extension]
  321 enabledExtensionsFromSettings = List.foldl' apply []
  322   where
  323     apply exts setting =
  324       case setting of
  325         EnableExtension ext
  326           | ext `elem` exts -> exts
  327           | otherwise -> exts <> [ext]
  328         DisableExtension ext -> filter (/= ext) exts
  329 
  330 -- | Produce the lazy stream of raw lexical tokens.
  331 --
  332 -- Pending synthetic tokens are emitted first, then trivia is skipped, and finally
  333 -- the next real token is scanned from the remaining input.
  334 --
  335 -- The lexer tracks the previous token kind and whether trivia was consumed between
  336 -- tokens, which enables inline handling of LexicalNegation and NegativeLiterals.
  337 scanTokens :: LexerState -> [LexToken]
  338 scanTokens st0 =
  339   case lexerPending st0 of
  340     tok : rest ->
  341       let st0' = st0 {lexerPending = rest, lexerPrevTokenKind = Just (lexTokenKind tok), lexerHadTrivia = False}
  342        in tok : scanTokens st0'
  343     [] ->
  344       let st = skipTrivia st0
  345        in case lexerPending st of
  346             tok : rest ->
  347               let st' = st {lexerPending = rest, lexerPrevTokenKind = Just (lexTokenKind tok), lexerHadTrivia = False}
  348                in tok : scanTokens st'
  349             []
  350               | null (lexerInput st) -> []
  351               | otherwise ->
  352                   -- Reset hadTrivia flag is already set by skipTrivia; we just lex the token
  353                   let (tok, st') = nextToken st
  354                       st'' = st' {lexerPrevTokenKind = Just (lexTokenKind tok), lexerHadTrivia = False}
  355                    in tok : scanTokens st''
  356 
  357 -- | Skip ignorable trivia until the next token boundary.
  358 --
  359 -- Control directives are treated specially: valid directives update lexer position
  360 -- state without emitting a token, while malformed directives enqueue 'TkError'
  361 -- tokens for later emission.
  362 skipTrivia :: LexerState -> LexerState
  363 skipTrivia st = maybe st skipTrivia (consumeTrivia st)
  364 
-- | Consume one unit of trivia, or return 'Nothing' at a token boundary.
--
-- The alternatives fall through in order: a failed guard on an earlier pattern
-- lets a later, more specific pattern (or the final catch-all) try the input.
-- Pragmas that the token scanner handles itself ('tryConsumeKnownPragma')
-- deliberately return 'Nothing' so they are lexed as real tokens.
consumeTrivia :: LexerState -> Maybe LexerState
consumeTrivia st
  | null (lexerInput st) = Nothing
  | otherwise =
      case lexerInput st of
        c : _
          -- Horizontal whitespace is consumed in one run; newlines one at a time.
          | c == ' ' || c == '\t' || c == '\r' -> Just (markHadTrivia (consumeWhile (\x -> x == ' ' || x == '\t' || x == '\r') st))
          | c == '\n' -> Just (markHadTrivia (advanceChars "\n" st))
        '-' : '-' : rest
          -- Only a true line comment; "--|"-style operators must still lex as tokens.
          | isLineComment rest -> Just (markHadTrivia (consumeLineComment st))
        '{' : '-' : '#' : _ ->
          case tryConsumeControlPragma st of
            -- Control pragma consumed silently (position updates only).
            Just (Nothing, st') -> Just (markHadTrivia st')
            -- Malformed control pragma: enqueue the error token for later emission.
            Just (Just tok, st') -> Just (markHadTrivia st' {lexerPending = lexerPending st' <> [tok]})
            Nothing ->
              case tryConsumeKnownPragma st of
                -- Known pragma: stop here so nextToken lexes it as a real token.
                Just _ -> Nothing
                Nothing ->
                  -- Unknown pragma: skip it as trivia (if consumable).
                  markHadTrivia <$> consumeUnknownPragma st
        '{' : '-' : _ ->
          Just (markHadTrivia (consumeBlockCommentOrError st))
        _ ->
          -- Possible '#'-style line directive; otherwise we are at a token boundary.
          case tryConsumeLineDirective st of
            Just (Nothing, st') -> Just (markHadTrivia st')
            Just (Just tok, st') -> Just (markHadTrivia st' {lexerPending = lexerPending st' <> [tok]})
            Nothing -> Nothing
  391 
  392 -- | Mark that trivia was consumed
  393 markHadTrivia :: LexerState -> LexerState
  394 markHadTrivia st = st {lexerHadTrivia = True}
  395 
  396 nextToken :: LexerState -> (LexToken, LexerState)
  397 nextToken st =
  398   fromMaybe (lexErrorToken st "unexpected character") (firstJust tokenParsers)
  399   where
  400     tokenParsers =
  401       [ lexKnownPragma,
  402         lexQuasiQuote,
  403         lexHexFloat,
  404         lexFloat,
  405         lexIntBase,
  406         lexInt,
  407         lexPromotedQuote,
  408         lexChar,
  409         lexString,
  410         lexSymbol,
  411         lexIdentifier,
  412         lexNegativeLiteralOrMinus,
  413         lexBangOrTildeOperator, -- must come before lexOperator
  414         lexOperator
  415       ]
  416 
  417     firstJust [] = Nothing
  418     firstJust (parser : rest) =
  419       case parser st of
  420         Just out -> Just out
  421         Nothing -> firstJust rest
  422 
-- | Insert virtual layout tokens (@{@, @;@, @}@) into a raw token stream.
--
-- For each token, four phases run in a fixed order:
--
-- 1. module-layout bookkeeping before the token,
-- 2. parse-error-rule closes ('closeBeforeToken'),
-- 3. opening any pending implicit block ('openPendingLayout'),
-- 4. beginning-of-line dedent/semicolon insertion ('bolLayout'),
--    skipped when a block was just opened at this token.
--
-- Remaining implicit contexts are closed at end of input.
applyLayoutTokens :: Bool -> [LexToken] -> [LexToken]
applyLayoutTokens enableModuleLayout =
  go
    LayoutState
      { layoutContexts = [],
        layoutPendingLayout = Nothing,
        layoutPrevLine = Nothing,
        layoutPrevTokenKind = Nothing,
        layoutDelimiterDepth = 0,
        layoutModuleMode =
          if enableModuleLayout
            then ModuleLayoutSeekStart
            else ModuleLayoutOff
      }
  where
    go st toks =
      case toks of
        -- End of input: emit a virtual '}' for every still-open implicit context.
        [] -> closeAllImplicit (layoutContexts st) NoSourceSpan
        tok : rest ->
          let stModule = noteModuleLayoutBeforeToken st tok
              (preInserted, stBeforePending) = closeBeforeToken stModule tok
              (pendingInserted, stAfterPending, skipBOL) = openPendingLayout stBeforePending tok
              -- A freshly opened block must not immediately get BOL treatment.
              (bolInserted, stAfterBOL) = if skipBOL then ([], stAfterPending) else bolLayout stAfterPending tok
              stAfterToken = noteModuleLayoutAfterToken (stepTokenContext stAfterBOL tok) tok
              stNext =
                stAfterToken
                  { layoutPrevLine = Just (tokenStartLine tok),
                    layoutPrevTokenKind = Just (lexTokenKind tok)
                  }
           in preInserted <> pendingInserted <> bolInserted <> (tok : go stNext rest)
  453 
  454 noteModuleLayoutBeforeToken :: LayoutState -> LexToken -> LayoutState
  455 noteModuleLayoutBeforeToken st tok =
  456   case layoutModuleMode st of
  457     ModuleLayoutSeekStart ->
  458       case lexTokenKind tok of
  459         TkPragmaLanguage _ -> st
  460         TkPragmaWarning _ -> st
  461         TkPragmaDeprecated _ -> st
  462         TkKeywordModule -> st {layoutModuleMode = ModuleLayoutAwaitWhere}
  463         _ -> st {layoutModuleMode = ModuleLayoutDone, layoutPendingLayout = Just PendingLayoutGeneric}
  464     _ -> st
  465 
  466 noteModuleLayoutAfterToken :: LayoutState -> LexToken -> LayoutState
  467 noteModuleLayoutAfterToken st tok =
  468   case layoutModuleMode st of
  469     ModuleLayoutAwaitWhere
  470       | lexTokenKind tok == TkKeywordWhere ->
  471           st {layoutModuleMode = ModuleLayoutDone, layoutPendingLayout = Just PendingLayoutGeneric}
  472     _ -> st
  473 
  474 openPendingLayout :: LayoutState -> LexToken -> ([LexToken], LayoutState, Bool)
  475 openPendingLayout st tok =
  476   case layoutPendingLayout st of
  477     Nothing -> ([], st, False)
  478     Just pending ->
  479       case lexTokenKind tok of
  480         TkSpecialLBrace -> ([], st {layoutPendingLayout = Nothing}, False)
  481         _ ->
  482           let col = tokenStartCol tok
  483               parentIndent = currentLayoutIndent (layoutContexts st)
  484               openTok = virtualSymbolToken "{" (lexTokenSpan tok)
  485               closeTok = virtualSymbolToken "}" (lexTokenSpan tok)
  486               newContext =
  487                 case pending of
  488                   PendingLayoutGeneric -> LayoutImplicit col
  489                   PendingLayoutLet -> LayoutImplicitLet col
  490                   PendingLayoutAfterThenElse -> LayoutImplicitAfterThenElse col
  491            in if col <= parentIndent
  492                 then ([openTok, closeTok], st {layoutPendingLayout = Nothing}, False)
  493                 else
  494                   ( [openTok],
  495                     st
  496                       { layoutPendingLayout = Nothing,
  497                         layoutContexts = newContext : layoutContexts st
  498                       },
  499                     True
  500                   )
  501 
  502 closeBeforeToken :: LayoutState -> LexToken -> ([LexToken], LayoutState)
  503 closeBeforeToken st tok =
  504   case lexTokenKind tok of
  505     TkKeywordIn ->
  506       let (inserted, contexts') = closeLeadingImplicitLets (lexTokenSpan tok) (layoutContexts st)
  507        in (inserted, st {layoutContexts = contexts'})
  508     TkSpecialComma
  509       | layoutDelimiterDepth st == 0 ->
  510           let (inserted, contexts') = closeLeadingImplicitLets (lexTokenSpan tok) (layoutContexts st)
  511            in (inserted, st {layoutContexts = contexts'})
  512     -- Close implicit layout contexts before closing delimiters (parse-error rule)
  513     TkSpecialRParen ->
  514       let (inserted, contexts') = closeAllImplicitBeforeDelimiter (lexTokenSpan tok) (layoutContexts st)
  515        in (inserted, st {layoutContexts = contexts'})
  516     TkSpecialRBracket ->
  517       let (inserted, contexts') = closeAllImplicitBeforeDelimiter (lexTokenSpan tok) (layoutContexts st)
  518        in (inserted, st {layoutContexts = contexts'})
  519     TkSpecialRBrace ->
  520       let (inserted, contexts') = closeAllImplicitBeforeDelimiter (lexTokenSpan tok) (layoutContexts st)
  521        in (inserted, st {layoutContexts = contexts'})
  522     -- Close implicit layout contexts before 'then' and 'else' keywords (parse-error rule)
  523     -- These keywords cannot appear inside a do block, so we close contexts at >= their column.
  524     TkKeywordThen ->
  525       let col = tokenStartCol tok
  526           (inserted, contexts') = closeForDedentInclusive col (lexTokenSpan tok) (layoutContexts st)
  527        in (inserted, st {layoutContexts = contexts'})
  528     TkKeywordElse ->
  529       let col = tokenStartCol tok
  530           (inserted, contexts') = closeForDedentInclusive col (lexTokenSpan tok) (layoutContexts st)
  531        in (inserted, st {layoutContexts = contexts'})
  532     -- Close implicit layout contexts before 'where' keyword (parse-error rule)
  533     -- 'where' at the same column as an implicit layout closes that layout,
  534     -- allowing it to attach to the enclosing definition.
  535     TkKeywordWhere ->
  536       let col = tokenStartCol tok
  537           (inserted, contexts') = closeForDedentInclusiveAll col (lexTokenSpan tok) (layoutContexts st)
  538        in (inserted, st {layoutContexts = contexts'})
  539     _ -> ([], st)
  540 
  541 bolLayout :: LayoutState -> LexToken -> ([LexToken], LayoutState)
  542 bolLayout st tok
  543   | not (isBOL st tok) = ([], st)
  544   | otherwise =
  545       let col = tokenStartCol tok
  546           (inserted, contexts') = closeForDedent col (lexTokenSpan tok) (layoutContexts st)
  547           eqSemi =
  548             case currentLayoutIndentMaybe contexts' of
  549               Just indent
  550                 | col == indent,
  551                   not (suppressesVirtualSemicolon tok) ->
  552                     [virtualSymbolToken ";" (lexTokenSpan tok)]
  553               _ -> []
  554        in (inserted <> eqSemi, st {layoutContexts = contexts'})
  555 
  556 suppressesVirtualSemicolon :: LexToken -> Bool
  557 suppressesVirtualSemicolon tok =
  558   case lexTokenKind tok of
  559     TkKeywordThen -> True
  560     TkKeywordElse -> True
  561     TkReservedDoubleArrow -> True -- =>
  562     TkReservedRightArrow -> True -- ->
  563     TkReservedEquals -> True -- =
  564     TkReservedPipe -> True
  565     TkReservedDoubleColon -> True -- ::
  566     _ -> False
  567 
  568 closeForDedent :: Int -> SourceSpan -> [LayoutContext] -> ([LexToken], [LayoutContext])
  569 closeForDedent col anchor = go []
  570   where
  571     go acc contexts =
  572       case contexts of
  573         LayoutImplicit indent : rest
  574           | col < indent -> go (virtualSymbolToken "}" anchor : acc) rest
  575           | otherwise -> (reverse acc, contexts)
  576         LayoutImplicitLet indent : rest
  577           | col < indent -> go (virtualSymbolToken "}" anchor : acc) rest
  578           | otherwise -> (reverse acc, contexts)
  579         LayoutImplicitAfterThenElse indent : rest
  580           | col < indent -> go (virtualSymbolToken "}" anchor : acc) rest
  581           | otherwise -> (reverse acc, contexts)
  582         _ -> (reverse acc, contexts)
  583 
  584 -- | Close layout contexts opened after 'then do' or 'else do' when encountering
  585 -- 'then' or 'else' at the same or lesser indent. This handles the parse-error rule
  586 -- for these specific cases where the keyword cannot be part of the do block.
  587 --
  588 -- This function first closes any implicit layouts with indent > col (regular dedent),
  589 -- then closes LayoutImplicitAfterThenElse contexts where col <= indent.
  590 -- This ensures that nested layouts (like case blocks) are closed before
  591 -- the then/else-specific layout closing.
  592 closeForDedentInclusive :: Int -> SourceSpan -> [LayoutContext] -> ([LexToken], [LayoutContext])
  593 closeForDedentInclusive col anchor = go []
  594   where
  595     go acc contexts =
  596       case contexts of
  597         -- Close any implicit layout with indent > col (dedent rule)
  598         LayoutImplicit indent : rest
  599           | col < indent -> go (virtualSymbolToken "}" anchor : acc) rest
  600           | otherwise -> (reverse acc, contexts)
  601         LayoutImplicitLet indent : rest
  602           | col < indent -> go (virtualSymbolToken "}" anchor : acc) rest
  603           | otherwise -> (reverse acc, contexts)
  604         -- Close LayoutImplicitAfterThenElse where col <= indent (parse-error rule)
  605         LayoutImplicitAfterThenElse indent : rest
  606           | col <= indent -> go (virtualSymbolToken "}" anchor : acc) rest
  607           | otherwise -> (reverse acc, contexts)
  608         _ -> (reverse acc, contexts)
  609 
  610 -- | Close all implicit layout contexts at or above the given column.
  611 -- Used for 'where' which needs to close all enclosing implicit layouts
  612 -- (not just LayoutImplicitAfterThenElse like then/else).
  613 closeForDedentInclusiveAll :: Int -> SourceSpan -> [LayoutContext] -> ([LexToken], [LayoutContext])
  614 closeForDedentInclusiveAll col anchor = go []
  615   where
  616     go acc contexts =
  617       case contexts of
  618         LayoutImplicit indent : rest
  619           | col <= indent -> go (virtualSymbolToken "}" anchor : acc) rest
  620           | otherwise -> (reverse acc, contexts)
  621         LayoutImplicitLet indent : rest
  622           | col <= indent -> go (virtualSymbolToken "}" anchor : acc) rest
  623           | otherwise -> (reverse acc, contexts)
  624         LayoutImplicitAfterThenElse indent : rest
  625           | col <= indent -> go (virtualSymbolToken "}" anchor : acc) rest
  626           | otherwise -> (reverse acc, contexts)
  627         _ -> (reverse acc, contexts)
  628 
  629 closeAllImplicit :: [LayoutContext] -> SourceSpan -> [LexToken]
  630 closeAllImplicit contexts anchor =
  631   [virtualSymbolToken "}" anchor | ctx <- contexts, isImplicitLayoutContext ctx]
  632 
  633 closeLeadingImplicitLets :: SourceSpan -> [LayoutContext] -> ([LexToken], [LayoutContext])
  634 closeLeadingImplicitLets anchor = go []
  635   where
  636     go acc contexts =
  637       case contexts of
  638         LayoutImplicitLet _ : rest -> go (virtualSymbolToken "}" anchor : acc) rest
  639         _ -> (reverse acc, contexts)
  640 
  641 -- | Close all implicit layout contexts up to (but not including) the first explicit context.
  642 -- Used to implement the Haskell Report's "parse-error" rule for closing delimiters.
  643 closeAllImplicitBeforeDelimiter :: SourceSpan -> [LayoutContext] -> ([LexToken], [LayoutContext])
  644 closeAllImplicitBeforeDelimiter anchor = go []
  645   where
  646     go acc contexts =
  647       case contexts of
  648         LayoutImplicit _ : rest -> go (virtualSymbolToken "}" anchor : acc) rest
  649         LayoutImplicitLet _ : rest -> go (virtualSymbolToken "}" anchor : acc) rest
  650         LayoutImplicitAfterThenElse _ : rest -> go (virtualSymbolToken "}" anchor : acc) rest
  651         _ -> (reverse acc, contexts)
  652 
  653 stepTokenContext :: LayoutState -> LexToken -> LayoutState
  654 stepTokenContext st tok =
  655   case lexTokenKind tok of
  656     TkKeywordDo
  657       | layoutPrevTokenKind st == Just TkKeywordThen
  658           || layoutPrevTokenKind st == Just TkKeywordElse ->
  659           st {layoutPendingLayout = Just PendingLayoutAfterThenElse}
  660       | otherwise -> st {layoutPendingLayout = Just PendingLayoutGeneric}
  661     TkKeywordOf -> st {layoutPendingLayout = Just PendingLayoutGeneric}
  662     TkKeywordCase
  663       | layoutPrevTokenKind st == Just TkReservedBackslash ->
  664           st {layoutPendingLayout = Just PendingLayoutGeneric}
  665       | otherwise -> st
  666     TkKeywordLet -> st {layoutPendingLayout = Just PendingLayoutLet}
  667     TkKeywordWhere -> st {layoutPendingLayout = Just PendingLayoutGeneric}
  668     TkSpecialLParen ->
  669       st
  670         { layoutDelimiterDepth = layoutDelimiterDepth st + 1,
  671           layoutContexts = LayoutDelimiter : layoutContexts st
  672         }
  673     TkSpecialLBracket ->
  674       st
  675         { layoutDelimiterDepth = layoutDelimiterDepth st + 1,
  676           layoutContexts = LayoutDelimiter : layoutContexts st
  677         }
  678     TkSpecialRParen ->
  679       st
  680         { layoutDelimiterDepth = max 0 (layoutDelimiterDepth st - 1),
  681           layoutContexts = popToDelimiter (layoutContexts st)
  682         }
  683     TkSpecialRBracket ->
  684       st
  685         { layoutDelimiterDepth = max 0 (layoutDelimiterDepth st - 1),
  686           layoutContexts = popToDelimiter (layoutContexts st)
  687         }
  688     TkSpecialLBrace -> st {layoutContexts = LayoutExplicit : layoutContexts st}
  689     TkSpecialRBrace -> st {layoutContexts = popOneContext (layoutContexts st)}
  690     _ -> st
  691 
  692 -- | Pop the layout context stack up to and including the nearest LayoutDelimiter.
  693 popToDelimiter :: [LayoutContext] -> [LayoutContext]
  694 popToDelimiter contexts =
  695   case contexts of
  696     LayoutDelimiter : rest -> rest
  697     _ : rest -> popToDelimiter rest
  698     [] -> []
  699 
  700 popOneContext :: [LayoutContext] -> [LayoutContext]
  701 popOneContext contexts =
  702   case contexts of
  703     _ : rest -> rest
  704     [] -> []
  705 
  706 currentLayoutIndent :: [LayoutContext] -> Int
  707 currentLayoutIndent contexts = fromMaybe 0 (currentLayoutIndentMaybe contexts)
  708 
  709 currentLayoutIndentMaybe :: [LayoutContext] -> Maybe Int
  710 currentLayoutIndentMaybe contexts =
  711   case contexts of
  712     LayoutImplicit indent : _ -> Just indent
  713     LayoutImplicitLet indent : _ -> Just indent
  714     LayoutImplicitAfterThenElse indent : _ -> Just indent
  715     _ -> Nothing
  716 
  717 isImplicitLayoutContext :: LayoutContext -> Bool
  718 isImplicitLayoutContext ctx =
  719   case ctx of
  720     LayoutImplicit _ -> True
  721     LayoutImplicitLet _ -> True
  722     LayoutImplicitAfterThenElse _ -> True
  723     LayoutExplicit -> False
  724     LayoutDelimiter -> False
  725 
  726 isBOL :: LayoutState -> LexToken -> Bool
  727 isBOL st tok =
  728   case layoutPrevLine st of
  729     Just prevLine -> tokenStartLine tok > prevLine
  730     Nothing -> False
  731 
  732 tokenStartLine :: LexToken -> Int
  733 tokenStartLine tok =
  734   case lexTokenSpan tok of
  735     SourceSpan line _ _ _ -> line
  736     NoSourceSpan -> 1
  737 
  738 tokenStartCol :: LexToken -> Int
  739 tokenStartCol tok =
  740   case lexTokenSpan tok of
  741     SourceSpan _ col _ _ -> col
  742     NoSourceSpan -> 1
  743 
  744 virtualSymbolToken :: Text -> SourceSpan -> LexToken
  745 virtualSymbolToken sym span' =
  746   LexToken
  747     { lexTokenKind = case sym of
  748         "{" -> TkSpecialLBrace
  749         "}" -> TkSpecialRBrace
  750         ";" -> TkSpecialSemicolon
  751         _ -> error ("virtualSymbolToken: unexpected symbol " ++ T.unpack sym),
  752       lexTokenText = sym,
  753       lexTokenSpan = span'
  754     }
  755 
  756 lexKnownPragma :: LexerState -> Maybe (LexToken, LexerState)
  757 lexKnownPragma st
  758   | Just ((raw, kind), st') <- parsePragmaLike parseLanguagePragma st = Just (mkToken st st' raw kind, st')
  759   | Just ((raw, kind), st') <- parsePragmaLike parseOptionsPragma st = Just (mkToken st st' raw kind, st')
  760   | Just ((raw, kind), st') <- parsePragmaLike parseWarningPragma st = Just (mkToken st st' raw kind, st')
  761   | Just ((raw, kind), st') <- parsePragmaLike parseDeprecatedPragma st = Just (mkToken st st' raw kind, st')
  762   | otherwise = Nothing
  763 
  764 parsePragmaLike :: (String -> Maybe (Int, (Text, LexTokenKind))) -> LexerState -> Maybe ((Text, LexTokenKind), LexerState)
  765 parsePragmaLike parser st = do
  766   (n, out) <- parser (lexerInput st)
  767   pure (out, advanceChars (take n (lexerInput st)) st)
  768 
-- | Lex an identifier (variable, constructor, keyword, or qualified name) at
-- the current position, including qualified operators such as @Prelude.+@.
--
-- The first character must satisfy 'isIdentStart'; module qualification is
-- gathered greedily (@A.B.c@), and a trailing @.op@ after a qualifier is
-- lexed as a qualified operator rather than a name.
lexIdentifier :: LexerState -> Maybe (LexToken, LexerState)
lexIdentifier st =
  case lexerInput st of
    c : rest
      | isIdentStart c ->
          let (seg, rest0) = span isIdentTail rest
              firstChunk = c : seg
              (consumed, rest1, isQualified) = gatherQualified firstChunk rest0
           in -- Check if we have a qualified operator (e.g., Prelude.+)
              -- A qualified-operator prefix requires either a qualified path or
              -- a single uppercase segment (a module name on its own).
              case (isQualified || isAsciiUpper c, rest1) of
                (True, '.' : opChar : opRest)
                  | isSymbolicOpCharNotDot opChar ->
                      -- This is a qualified operator like Prelude.+ or A.B.C.:++
                      let (opChars, _) = span isSymbolicOpChar (opChar : opRest)
                          fullOp = consumed <> "." <> opChars
                          opTxt = T.pack fullOp
                          kind =
                            -- Leading ':' means a constructor operator.
                            if opChar == ':'
                              then TkQConSym opTxt
                              else TkQVarSym opTxt
                          st' = advanceChars fullOp st
                       in Just (mkToken st st' opTxt kind, st')
                _ ->
                  -- Regular identifier
                  let ident = T.pack consumed
                      kind = classifyIdentifier c isQualified ident
                      st' = advanceChars consumed st
                   in Just (mkToken st st' ident kind, st')
    _ -> Nothing
  where
    -- Returns (consumed, remaining, isQualified)
    -- Extends the identifier across '.'-separated segments as long as each
    -- segment starts with an identifier-start character.
    gatherQualified acc chars =
      case chars of
        '.' : c' : more
          | isIdentStart c' ->
              let (seg, rest) = span isIdentTail more
               in gatherQualified (acc <> "." <> [c'] <> seg) rest
        _ -> (acc, chars, '.' `elem` acc)

    -- Check for symbol char that is not '.' to avoid consuming module path dots
    isSymbolicOpCharNotDot c = isSymbolicOpChar c && c /= '.'

    -- Classify the finished identifier: qualified names use the final segment's
    -- case; unqualified names check the keyword table first.
    classifyIdentifier firstChar isQualified ident
      | isQualified =
          -- Qualified name: use final part to determine var/con
          let finalPart = T.takeWhileEnd (/= '.') ident
              firstCharFinal = T.head finalPart
           in if isAsciiUpper firstCharFinal
                then TkQConId ident
                else TkQVarId ident
      | otherwise =
          -- Unqualified: check for keyword first
          case keywordTokenKind ident of
            Just kw -> kw
            Nothing ->
              if isAsciiUpper firstChar
                then TkConId ident
                else TkVarId ident
  827 
-- | Handle minus in the context of NegativeLiterals and LexicalNegation extensions.
--
-- When NegativeLiterals is enabled and context allows, attempts to lex a negative
-- literal by consuming '-' and delegating to existing number lexers.
--
-- When LexicalNegation is enabled, emits TkPrefixMinus or TkMinusOperator based
-- on position.
--
-- Otherwise, return Nothing and let lexOperator handle it.
lexNegativeLiteralOrMinus :: LexerState -> Maybe (LexToken, LexerState)
lexNegativeLiteralOrMinus st
  | not hasNegExt = Nothing
  | not (isStandaloneMinus (lexerInput st)) = Nothing
  | otherwise =
      let prevAllows = allowsMergeOrPrefix (lexerPrevTokenKind st) (lexerHadTrivia st)
          rest = drop 1 (lexerInput st) -- input after '-'
       in if NegativeLiterals `elem` lexerExtensions st && prevAllows
            then case tryLexNumberAfterMinus st of
              Just result -> Just result
              -- No numeric literal follows: fall back to operator handling.
              Nothing -> lexMinusOperator st rest prevAllows
            else lexMinusOperator st rest prevAllows
  where
    -- Neither extension enabled means this lexer does not apply at all.
    hasNegExt =
      NegativeLiterals `elem` lexerExtensions st
        || LexicalNegation `elem` lexerExtensions st
  859 
  860 -- | Check if input starts with a standalone '-' (not part of ->, -<, etc.)
  861 isStandaloneMinus :: String -> Bool
  862 isStandaloneMinus input =
  863   case input of
  864     '-' : c : _ | isSymbolicOpChar c && c /= '-' -> False -- part of multi-char op
  865     '-' : _ -> True
  866     _ -> False
  867 
  868 -- | Try to lex a negative number by delegating to existing number lexers.
  869 -- Consumes '-', runs number lexers on remainder, negates result if successful.
  870 tryLexNumberAfterMinus :: LexerState -> Maybe (LexToken, LexerState)
  871 tryLexNumberAfterMinus st = do
  872   -- Create a temporary state positioned after the '-'
  873   let stAfterMinus = advanceChars "-" st
  874   -- Try existing number lexers in order (most specific first)
  875   (numTok, stFinal) <- firstJust numberLexers stAfterMinus
  876   -- Build combined negative token
  877   Just (negateToken st numTok, stFinal)
  878   where
  879     numberLexers = [lexHexFloat, lexFloat, lexIntBase, lexInt]
  880 
  881     firstJust [] _ = Nothing
  882     firstJust (f : fs) s = case f s of
  883       Just result -> Just result
  884       Nothing -> firstJust fs s
  885 
  886 -- | Negate a numeric token and adjust its span/text to include the leading '-'.
  887 negateToken :: LexerState -> LexToken -> LexToken
  888 negateToken stBefore numTok =
  889   LexToken
  890     { lexTokenKind = negateKind (lexTokenKind numTok),
  891       lexTokenText = "-" <> lexTokenText numTok,
  892       lexTokenSpan = extendSpanLeft (lexTokenSpan numTok)
  893     }
  894   where
  895     negateKind k = case k of
  896       TkInteger n -> TkInteger (negate n)
  897       TkIntegerBase n repr -> TkIntegerBase (negate n) ("-" <> repr)
  898       TkFloat n repr -> TkFloat (negate n) ("-" <> repr)
  899       other -> other -- shouldn't happen
  900 
  901     -- Extend span to start at the '-' position
  902     extendSpanLeft sp = case sp of
  903       SourceSpan _ _ endLine endCol ->
  904         SourceSpan (lexerLine stBefore) (lexerCol stBefore) endLine endCol
  905       NoSourceSpan -> NoSourceSpan
  906 
  907 -- | Emit TkPrefixMinus or TkMinusOperator based on LexicalNegation rules.
  908 lexMinusOperator :: LexerState -> String -> Bool -> Maybe (LexToken, LexerState)
  909 lexMinusOperator st rest prevAllows
  910   | LexicalNegation `notElem` lexerExtensions st = Nothing
  911   | otherwise =
  912       let st' = advanceChars "-" st
  913           kind =
  914             if prevAllows && canStartNegatedAtom rest
  915               then TkPrefixMinus
  916               else TkMinusOperator
  917        in Just (mkToken st st' "-" kind, st')
  918 
  919 -- | Check if the preceding token context allows a merge (NegativeLiterals) or
  920 -- prefix minus (LexicalNegation).
  921 --
  922 -- The merge/prefix is allowed when:
  923 -- - There is no previous token (start of input)
  924 -- - There was whitespace/trivia before the '-'
  925 -- - The previous token is an operator or punctuation that allows tight unary prefix
  926 allowsMergeOrPrefix :: Maybe LexTokenKind -> Bool -> Bool
  927 allowsMergeOrPrefix prev hadTrivia =
  928   case prev of
  929     Nothing -> True
  930     Just _ | hadTrivia -> True
  931     Just prevKind -> prevTokenAllowsTightPrefix prevKind
  932 
  933 -- | Check if the preceding token allows a tight unary prefix (like negation).
  934 -- This is used when there's no whitespace between the previous token and '-'.
  935 prevTokenAllowsTightPrefix :: LexTokenKind -> Bool
  936 prevTokenAllowsTightPrefix kind =
  937   case kind of
  938     TkSpecialLParen -> True
  939     TkSpecialLBracket -> True
  940     TkSpecialLBrace -> True
  941     TkSpecialComma -> True
  942     TkSpecialSemicolon -> True
  943     TkVarSym _ -> True
  944     TkConSym _ -> True
  945     TkQVarSym _ -> True
  946     TkQConSym _ -> True
  947     TkMinusOperator -> True
  948     TkPrefixMinus -> True
  949     TkReservedEquals -> True
  950     TkReservedLeftArrow -> True
  951     TkReservedRightArrow -> True
  952     TkReservedDoubleArrow -> True
  953     TkReservedDoubleColon -> True
  954     TkReservedPipe -> True
  955     TkReservedBackslash -> True
  956     _ -> False
  957 
  958 -- | Check if the given input could start a negated atom (for LexicalNegation).
  959 canStartNegatedAtom :: String -> Bool
  960 canStartNegatedAtom rest =
  961   case rest of
  962     [] -> False
  963     c : _
  964       | isIdentStart c -> True -- identifier
  965       | isDigit c -> True -- number
  966       | c == '\'' -> True -- char literal
  967       | c == '"' -> True -- string literal
  968       | c == '(' -> True -- parenthesized expression
  969       | c == '[' -> True -- list/TH brackets
  970       | c == '\\' -> True -- lambda
  971       | c == '-' -> True -- nested negation
  972       | otherwise -> False
  973 
  974 -- | Whitespace-sensitive lexing for ! and ~ operators (GHC proposal 0229).
  975 --
  976 -- Per the proposal, these operators are classified based on surrounding whitespace:
  977 --   - Prefix occurrence: whitespace before, no whitespace after → bang/lazy pattern
  978 --   - Otherwise (loose infix, tight infix, suffix) → regular operator
  979 --
  980 -- Examples:
  981 --   a ! b   -- loose infix (operator)
  982 --   a!b     -- tight infix (operator)
  983 --   a !b    -- prefix (bang pattern)
  984 --   a! b    -- suffix (operator)
  985 lexBangOrTildeOperator :: LexerState -> Maybe (LexToken, LexerState)
  986 lexBangOrTildeOperator st =
  987   case lexerInput st of
  988     '!' : rest -> lexPrefixSensitiveOp st '!' "!" TkPrefixBang rest
  989     '~' : rest -> lexPrefixSensitiveOp st '~' "~" TkPrefixTilde rest
  990     _ -> Nothing
  991 
-- | Lex a whitespace-sensitive prefix operator.
-- Returns TkPrefixBang/TkPrefixTilde if in prefix position, otherwise Nothing
-- to let lexOperator handle it as a regular VarSym.
--
-- Per GHC proposal 0229, prefix position is determined by:
--   - Whitespace before the operator, OR
--   - Previous token is an opening bracket/punctuation that allows tight prefix
-- AND no whitespace after (next char can start a pattern atom).
--
-- @rest@ is the input immediately after the operator character; @opChar@ and
-- @opStr@ are the same operator in Char and String form.
lexPrefixSensitiveOp :: LexerState -> Char -> String -> LexTokenKind -> String -> Maybe (LexToken, LexerState)
lexPrefixSensitiveOp st opChar opStr prefixKind rest
  -- Only handle single-character ! or ~ (not part of multi-char operator like !=)
  | isMultiCharOp rest = Nothing
  -- Prefix position: (whitespace before OR opening token before) AND next char can start a pattern atom
  | isPrefixPosition && canStartPrefixPatternAtom rest =
      let st' = advanceChars opStr st
       in Just (mkToken st st' (T.singleton opChar) prefixKind, st')
  -- Otherwise, let lexOperator handle it as a regular operator
  | otherwise = Nothing
  where
    -- Check if rest starts with another symbolic operator char (making this a multi-char op)
    isMultiCharOp (c : _) = isSymbolicOpChar c
    isMultiCharOp [] = False
    -- Prefix position is allowed when:
    -- - There is no previous token (start of input)
    -- - There was whitespace/trivia before the operator
    -- - The previous token is an opening bracket or punctuation that allows tight prefix
    isPrefixPosition =
      case lexerPrevTokenKind st of
        Nothing -> True -- start of input
        Just prevKind
          | lexerHadTrivia st -> True -- had whitespace before
          | otherwise -> prevTokenAllowsTightPrefix prevKind -- opening bracket, etc.
 1024 
 1025 -- | Check if the given input could start a pattern atom (for prefix ! and ~).
 1026 -- This is similar to canStartNegatedAtom but tailored for patterns.
 1027 canStartPrefixPatternAtom :: String -> Bool
 1028 canStartPrefixPatternAtom rest =
 1029   case rest of
 1030     [] -> False
 1031     c : _
 1032       | isIdentStart c -> True -- identifier (variable or constructor)
 1033       | isDigit c -> True -- numeric literal
 1034       | c == '\'' -> True -- char literal
 1035       | c == '"' -> True -- string literal
 1036       | c == '(' -> True -- parenthesized pattern or tuple
 1037       | c == '[' -> True -- list pattern
 1038       | c == '_' -> True -- wildcard
 1039       | c == '!' -> True -- nested bang pattern
 1040       | c == '~' -> True -- nested lazy pattern
 1041       | otherwise -> False
 1042 
 1043 lexOperator :: LexerState -> Maybe (LexToken, LexerState)
 1044 lexOperator st =
 1045   case span isSymbolicOpChar (lexerInput st) of
 1046     (op@(c : _), _) ->
 1047       let txt = T.pack op
 1048           st' = advanceChars op st
 1049           hasUnicode = UnicodeSyntax `elem` lexerExtensions st
 1050           kind = case reservedOpTokenKind txt of
 1051             Just reserved -> reserved
 1052             Nothing
 1053               | hasUnicode -> unicodeOpTokenKind txt c
 1054               | c == ':' -> TkConSym txt
 1055               | otherwise -> TkVarSym txt
 1056        in Just (mkToken st st' txt kind, st')
 1057     _ -> Nothing
 1058 
 1059 -- | Map Unicode operators to their ASCII equivalents when UnicodeSyntax is enabled.
 1060 -- Returns the appropriate token kind for known Unicode operators, or falls back
 1061 -- to TkVarSym/TkConSym based on whether the first character is ':'.
 1062 unicodeOpTokenKind :: Text -> Char -> LexTokenKind
 1063 unicodeOpTokenKind txt firstChar =
 1064   case T.unpack txt of
 1065     "∷" -> TkReservedDoubleColon -- :: (proportion)
 1066     "⇒" -> TkReservedDoubleArrow -- => (rightwards double arrow)
 1067     "→" -> TkReservedRightArrow -- -> (rightwards arrow)
 1068     "←" -> TkReservedLeftArrow -- <- (leftwards arrow)
 1069     "∀" -> TkVarId "forall" -- forall (for all)
 1070     "★" -> TkVarSym "*" -- star (for kind signatures)
 1071     "⤙" -> TkVarSym "-<" -- -< (leftwards arrow-tail)
 1072     "⤚" -> TkVarSym ">-" -- >- (rightwards arrow-tail)
 1073     "⤛" -> TkVarSym "-<<" -- -<< (leftwards double arrow-tail)
 1074     "⤜" -> TkVarSym ">>-" -- >>- (rightwards double arrow-tail)
 1075     "⦇" -> TkVarSym "(|" -- (| left banana bracket
 1076     "⦈" -> TkVarSym "|)" -- right banana bracket |)
 1077     "⟦" -> TkVarSym "[|" -- [| left semantic bracket
 1078     "⟧" -> TkVarSym "|]" -- right semantic bracket |]
 1079     "⊸" -> TkVarSym "%1->" -- %1-> (linear arrow)
 1080     _
 1081       | firstChar == ':' -> TkConSym txt
 1082       | otherwise -> TkVarSym txt
 1083 
 1084 lexSymbol :: LexerState -> Maybe (LexToken, LexerState)
 1085 lexSymbol st =
 1086   firstJust
 1087     [ ("(", TkSpecialLParen),
 1088       (")", TkSpecialRParen),
 1089       ("[", TkSpecialLBracket),
 1090       ("]", TkSpecialRBracket),
 1091       ("{", TkSpecialLBrace),
 1092       ("}", TkSpecialRBrace),
 1093       (",", TkSpecialComma),
 1094       (";", TkSpecialSemicolon),
 1095       ("`", TkSpecialBacktick)
 1096     ]
 1097   where
 1098     firstJust xs =
 1099       case xs of
 1100         [] -> Nothing
 1101         (txt, kind) : rest ->
 1102           if txt `List.isPrefixOf` lexerInput st
 1103             then
 1104               let st' = advanceChars txt st
 1105                in Just (mkToken st st' (T.pack txt) kind, st')
 1106             else firstJust rest
 1107 
 1108 lexIntBase :: LexerState -> Maybe (LexToken, LexerState)
 1109 lexIntBase st =
 1110   case lexerInput st of
 1111     '0' : base : rest
 1112       | base `elem` ("xXoObB" :: String) ->
 1113           let allowUnderscores = NumericUnderscores `elem` lexerExtensions st
 1114               isDigitChar
 1115                 | base `elem` ("xX" :: String) = isHexDigit
 1116                 | base `elem` ("oO" :: String) = isOctDigit
 1117                 | otherwise = (`elem` ("01" :: String))
 1118               (digitsRaw, _) = takeDigitsWithLeadingUnderscores allowUnderscores isDigitChar rest
 1119            in if null digitsRaw
 1120                 then Nothing
 1121                 else
 1122                   let raw = '0' : base : digitsRaw
 1123                       txt = T.pack raw
 1124                       n
 1125                         | base `elem` ("xX" :: String) = readHexLiteral txt
 1126                         | base `elem` ("oO" :: String) = readOctLiteral txt
 1127                         | otherwise = readBinLiteral txt
 1128                       st' = advanceChars raw st
 1129                    in Just (mkToken st st' txt (TkIntegerBase n txt), st')
 1130     _ -> Nothing
 1131 
-- | Lex a hexadecimal floating-point literal (e.g. @0x1.8p3@).
--
-- Requires a @0x@/@0X@ prefix, at least one hex digit, and a binary exponent
-- from 'takeHexExponent'. The Maybe-do pattern matches fail to 'Nothing'
-- (MonadFail) when the shape does not fit, letting other lexers try.
lexHexFloat :: LexerState -> Maybe (LexToken, LexerState)
lexHexFloat st = do
  ('0' : x : rest) <- Just (lexerInput st)
  if x `notElem` ("xX" :: String)
    then Nothing
    else do
      let (intDigits, rest1) = span isHexDigit rest
      if null intDigits
        then Nothing
        else do
          -- Optional fractional part after a '.'.
          let (mFracDigits, rest2) =
                case rest1 of
                  '.' : more ->
                    let (frac, rest') = span isHexDigit more
                     in (Just frac, rest')
                  _ -> (Nothing, rest1)
          -- The exponent marker must be followed by digits; pattern failure
          -- here aborts the whole attempt.
          expo@(_ : expoRest) <- takeHexExponent rest2
          let fracDigits = fromMaybe "" mFracDigits
          if null expoRest
            then Nothing
            else
              let dotAndFrac =
                    case mFracDigits of
                      Just ds -> '.' : ds
                      Nothing -> ""
                  raw = '0' : x : intDigits <> dotAndFrac <> expo
                  txt = T.pack raw
                  value = parseHexFloatLiteral intDigits fracDigits expo
                  st' = advanceChars raw st
               in Just (mkToken st st' txt (TkFloat value txt), st')
 1162 
-- | Lex a decimal floating-point literal: digits with a fractional part
-- (@1.5@, @1.5e3@) or digits followed directly by an exponent (@1e3@).
-- NumericUnderscores grouping is stripped before 'read'.
lexFloat :: LexerState -> Maybe (LexToken, LexerState)
lexFloat st =
  let allowUnderscores = NumericUnderscores `elem` lexerExtensions st
      (lhsRaw, rest) = takeDigitsWithUnderscores allowUnderscores isDigit (lexerInput st)
   in if null lhsRaw
        then Nothing
        else case rest of
          -- A '.' must be followed by a digit, otherwise it is composition/
          -- qualification and this is not a float.
          '.' : d : more
            | isDigit d ->
                let (rhsRaw, rest') = takeDigitsWithUnderscores allowUnderscores isDigit (d : more)
                    (expo, _) = takeExponent allowUnderscores rest'
                    raw = lhsRaw <> "." <> rhsRaw <> expo
                    txt = T.pack raw
                    -- Underscore separators are lexical only; drop before read.
                    normalized = filter (/= '_') raw
                    st' = advanceChars raw st
                 in Just (mkToken st st' txt (TkFloat (read normalized) txt), st')
          _ ->
            -- No fraction: only a float when an exponent follows (e.g. 1e3).
            case takeExponent allowUnderscores rest of
              ("", _) -> Nothing
              (expo, _) ->
                let raw = lhsRaw <> expo
                    txt = T.pack raw
                    normalized = filter (/= '_') raw
                    st' = advanceChars raw st
                 in Just (mkToken st st' txt (TkFloat (read normalized) txt), st')
 1188 
 1189 lexInt :: LexerState -> Maybe (LexToken, LexerState)
 1190 lexInt st =
 1191   let allowUnderscores = NumericUnderscores `elem` lexerExtensions st
 1192       (digitsRaw, _) = takeDigitsWithUnderscores allowUnderscores isDigit (lexerInput st)
 1193    in if null digitsRaw
 1194         then Nothing
 1195         else
 1196           let txt = T.pack digitsRaw
 1197               digits = filter (/= '_') digitsRaw
 1198               st' = advanceChars digitsRaw st
 1199            in Just (mkToken st st' txt (TkInteger (read digits)), st')
 1200 
-- | Lex a DataKinds promotion quote (the tick in @'True@, @'[ ... ]@, @'(:)@).
--
-- Only fires when DataKinds is enabled, the quote does NOT begin a valid
-- character literal, and the next character can start a promoted form.
-- The quote is emitted as a one-character TkVarSym token.
lexPromotedQuote :: LexerState -> Maybe (LexToken, LexerState)
lexPromotedQuote st
  | DataKinds `notElem` lexerExtensions st = Nothing
  | otherwise =
      case lexerInput st of
        '\'' : rest
          -- Char literals like 'a' take precedence over promotion.
          | isValidCharLiteral rest -> Nothing
          | isPromotionStart rest ->
              let st' = advanceChars "'" st
               in Just (mkToken st st' "'" (TkVarSym "'"), st')
          | otherwise -> Nothing
        _ -> Nothing
  where
    -- A quote starts a char literal if the quoted body reads as a Char.
    isValidCharLiteral chars =
      case scanQuoted '\'' chars of
        Right (body, _) -> isJust (readMaybeChar ('\'' : body <> "'"))
        Left _ -> False

    -- Promoted forms start with '[', '(', ':' or an uppercase constructor.
    isPromotionStart chars =
      case chars of
        c : _
          | c == '[' -> True
          | c == '(' -> True
          | c == ':' -> True
          | isAsciiUpper c -> True
        _ -> False
 1227 
 1228 lexChar :: LexerState -> Maybe (LexToken, LexerState)
 1229 lexChar st =
 1230   case lexerInput st of
 1231     '\'' : rest ->
 1232       case scanQuoted '\'' rest of
 1233         Right (body, _) ->
 1234           let raw = '\'' : body <> "'"
 1235               st' = advanceChars raw st
 1236            in case readMaybeChar raw of
 1237                 Just c -> Just (mkToken st st' (T.pack raw) (TkChar c), st')
 1238                 Nothing -> Just (mkErrorToken st st' (T.pack raw) "invalid char literal", st')
 1239         Left raw ->
 1240           let full = '\'' : raw
 1241               st' = advanceChars full st
 1242            in Just (mkErrorToken st st' (T.pack full) "unterminated char literal", st')
 1243     _ -> Nothing
 1244 
 1245 lexString :: LexerState -> Maybe (LexToken, LexerState)
 1246 lexString st =
 1247   case lexerInput st of
 1248     '"' : rest ->
 1249       case scanQuoted '"' rest of
 1250         Right (body, _) ->
 1251           let raw = "\"" <> body <> "\""
 1252               decoded =
 1253                 case reads raw of
 1254                   [(str, "")] -> T.pack str
 1255                   _ -> T.pack body
 1256               st' = advanceChars raw st
 1257            in Just (mkToken st st' (T.pack raw) (TkString decoded), st')
 1258         Left raw ->
 1259           let full = '"' : raw
 1260               st' = advanceChars full st
 1261            in Just (mkErrorToken st st' (T.pack full) "unterminated string literal", st')
 1262     _ -> Nothing
 1263 
 1264 lexQuasiQuote :: LexerState -> Maybe (LexToken, LexerState)
 1265 lexQuasiQuote st =
 1266   case lexerInput st of
 1267     '[' : rest ->
 1268       case parseQuasiQuote rest of
 1269         Just (quoter, body, _) ->
 1270           let raw = "[" <> quoter <> "|" <> body <> "|]"
 1271               st' = advanceChars raw st
 1272            in Just (mkToken st st' (T.pack raw) (TkQuasiQuote (T.pack quoter) (T.pack body)), st')
 1273         Nothing -> Nothing
 1274     _ -> Nothing
 1275   where
 1276     parseQuasiQuote chars =
 1277       let (quoter, rest0) = takeQuoter chars
 1278        in case rest0 of
 1279             '|' : rest1
 1280               | not (null quoter) ->
 1281                   let (body, rest2) = breakOnMarker "|]" rest1
 1282                    in case rest2 of
 1283                         '|' : ']' : _ -> Just (quoter, body, take (length body + 2) rest1)
 1284                         _ -> Nothing
 1285             _ -> Nothing
 1286 
 1287 lexErrorToken :: LexerState -> Text -> (LexToken, LexerState)
 1288 lexErrorToken st msg =
 1289   let raw = take 1 (lexerInput st)
 1290       rawTxt = if null raw then "<eof>" else T.pack raw
 1291       st' = if null raw then st else advanceChars raw st
 1292    in ( mkErrorToken st st' rawTxt msg,
 1293         st'
 1294       )
 1295 
 1296 mkErrorToken :: LexerState -> LexerState -> Text -> Text -> LexToken
 1297 mkErrorToken start end rawTxt msg =
 1298   mkToken start end rawTxt (TkError msg)
 1299 
-- | Consume a CPP-style @# line@ directive at the start of a line, if present.
--
-- Returns Nothing when not at line start or the line is not a '#' directive.
-- A well-formed directive updates line/column bookkeeping without emitting a
-- token; a malformed one is surfaced as a TkError token.
tryConsumeLineDirective :: LexerState -> Maybe (Maybe LexToken, LexerState)
tryConsumeLineDirective st
  | not (lexerAtLineStart st) = Nothing
  | otherwise =
      -- Leading horizontal whitespace before '#' is swallowed with the directive.
      let (spaces, rest) = span (\c -> c == ' ' || c == '\t') (lexerInput st)
       in case rest of
            '#' : more ->
              let lineText = '#' : takeLineRemainder more
                  consumed = spaces <> lineText
               in case parseHashLineDirective lineText of
                    Just update ->
                      -- Valid directive: adjust position, emit no token.
                      Just (Nothing, applyDirectiveAdvance consumed update st)
                    Nothing ->
                      let st' = advanceChars consumed st
                       in Just (Just (mkToken st st' (T.pack consumed) (TkError "malformed line directive")), st')
            _ -> Nothing
 1316 
 1317 tryConsumeControlPragma :: LexerState -> Maybe (Maybe LexToken, LexerState)
 1318 tryConsumeControlPragma st =
 1319   case parseControlPragma (lexerInput st) of
 1320     Just (consumed0, Right update0) ->
 1321       let (consumed, update) =
 1322             case directiveLine update0 of
 1323               Just lineNo ->
 1324                 case drop (length consumed0) (lexerInput st) of
 1325                   '\n' : _ ->
 1326                     (consumed0 <> "\n", update0 {directiveLine = Just lineNo, directiveCol = Just 1})
 1327                   _ -> (consumed0, update0)
 1328               Nothing -> (consumed0, update0)
 1329        in Just (Nothing, applyDirectiveAdvance consumed update st)
 1330     Just (consumed, Left msg) ->
 1331       let st' = advanceChars consumed st
 1332        in Just (Just (mkToken st st' (T.pack consumed) (TkError msg)), st')
 1333     Nothing -> Nothing
 1334 
 1335 applyDirectiveAdvance :: String -> DirectiveUpdate -> LexerState -> LexerState
 1336 applyDirectiveAdvance consumed update st =
 1337   let hasTrailingNewline =
 1338         case reverse consumed of
 1339           '\n' : _ -> True
 1340           _ -> False
 1341    in st
 1342         { lexerInput = drop (length consumed) (lexerInput st),
 1343           lexerLine = maybe (lexerLine st) (max 1) (directiveLine update),
 1344           lexerCol = maybe (lexerCol st) (max 1) (directiveCol update),
 1345           lexerAtLineStart = hasTrailingNewline || (Just 1 == directiveCol update)
 1346         }
 1347 
 1348 consumeLineComment :: LexerState -> LexerState
 1349 consumeLineComment st = advanceChars consumed st
 1350   where
 1351     consumed = takeCommentRemainder (drop 2 (lexerInput st))
 1352     prefix = "--"
 1353     takeCommentRemainder xs =
 1354       prefix <> takeWhile (/= '\n') xs
 1355 
 1356 consumeUnknownPragma :: LexerState -> Maybe LexerState
 1357 consumeUnknownPragma st =
 1358   case breakOnMarker "#-}" (lexerInput st) of
 1359     (_, "") -> Nothing
 1360     (body, marker) ->
 1361       let consumed = body <> take 3 marker
 1362        in Just (advanceChars consumed st)
 1363 
 1364 consumeBlockComment :: LexerState -> Maybe LexerState
 1365 consumeBlockComment st =
 1366   case scanNestedBlockComment 1 (drop 2 (lexerInput st)) of
 1367     Just consumedTail -> Just (advanceChars ("{-" <> consumedTail) st)
 1368     Nothing -> Nothing
 1369 
 1370 consumeBlockCommentOrError :: LexerState -> LexerState
 1371 consumeBlockCommentOrError st =
 1372   case consumeBlockComment st of
 1373     Just st' -> st'
 1374     Nothing ->
 1375       let consumed = lexerInput st
 1376           st' = advanceChars consumed st
 1377           tok = mkToken st st' (T.pack consumed) (TkError "unterminated block comment")
 1378        in st' {lexerPending = lexerPending st' <> [tok]}
 1379 
 1380 scanNestedBlockComment :: Int -> String -> Maybe String
 1381 scanNestedBlockComment depth chars
 1382   | depth <= 0 = Just ""
 1383   | otherwise =
 1384       case chars of
 1385         [] -> Nothing
 1386         '{' : '-' : rest -> ("{-" <>) <$> scanNestedBlockComment (depth + 1) rest
 1387         '-' : '}' : rest ->
 1388           if depth == 1
 1389             then Just "-}"
 1390             else ("-}" <>) <$> scanNestedBlockComment (depth - 1) rest
 1391         c : rest -> (c :) <$> scanNestedBlockComment depth rest
 1392 
 1393 tryConsumeKnownPragma :: LexerState -> Maybe ()
 1394 tryConsumeKnownPragma st =
 1395   case lexKnownPragma st of
 1396     Just _ -> Just ()
 1397     Nothing -> Nothing
 1398 
 1399 parseLanguagePragma :: String -> Maybe (Int, (Text, LexTokenKind))
 1400 parseLanguagePragma input = do
 1401   (_, body, consumed) <- stripNamedPragma ["LANGUAGE"] input
 1402   let names = parseLanguagePragmaNames (T.pack body)
 1403       raw = "{-# LANGUAGE " <> T.unpack (T.intercalate ", " (map extensionSettingName names)) <> " #-}"
 1404   pure (length consumed, (T.pack raw, TkPragmaLanguage names))
 1405 
 1406 parseOptionsPragma :: String -> Maybe (Int, (Text, LexTokenKind))
 1407 parseOptionsPragma input = do
 1408   (pragmaName, body, consumed) <- stripNamedPragma ["OPTIONS_GHC", "OPTIONS"] input
 1409   let settings = parseOptionsPragmaSettings (T.pack body)
 1410       raw = "{-# " <> pragmaName <> " " <> T.unpack (T.strip (T.pack body)) <> " #-}"
 1411   pure (length consumed, (T.pack raw, TkPragmaLanguage settings))
 1412 
 1413 parseWarningPragma :: String -> Maybe (Int, (Text, LexTokenKind))
 1414 parseWarningPragma input = do
 1415   (_, body, consumed) <- stripNamedPragma ["WARNING"] input
 1416   let txt = T.strip (T.pack body)
 1417       (msg, rawMsg) =
 1418         case body of
 1419           '"' : _ ->
 1420             case reads body of
 1421               [(decoded, "")] -> (T.pack decoded, T.pack body)
 1422               _ -> (txt, txt)
 1423           _ -> (txt, txt)
 1424       raw = "{-# WARNING " <> rawMsg <> " #-}"
 1425   pure (length consumed, (raw, TkPragmaWarning msg))
 1426 
 1427 parseDeprecatedPragma :: String -> Maybe (Int, (Text, LexTokenKind))
 1428 parseDeprecatedPragma input = do
 1429   (_, body, consumed) <- stripNamedPragma ["DEPRECATED"] input
 1430   let txt = T.strip (T.pack body)
 1431       (msg, rawMsg) =
 1432         case body of
 1433           '"' : _ ->
 1434             case reads body of
 1435               [(decoded, "")] -> (T.pack decoded, T.pack body)
 1436               _ -> (txt, txt)
 1437           _ -> (txt, txt)
 1438       raw = "{-# DEPRECATED " <> rawMsg <> " #-}"
 1439   pure (length consumed, (raw, TkPragmaDeprecated msg))
 1440 
 1441 stripPragma :: String -> String -> Maybe String
 1442 stripPragma name input = (\(_, body, _) -> body) <$> stripNamedPragma [name] input
 1443 
 1444 stripNamedPragma :: [String] -> String -> Maybe (String, String, String)
 1445 stripNamedPragma names input = do
 1446   rest0 <- List.stripPrefix "{-#" input
 1447   let rest1 = dropWhile isSpace rest0
 1448   name <- List.find (`List.isPrefixOf` rest1) names
 1449   rest2 <- List.stripPrefix name rest1
 1450   let rest3 = dropWhile isSpace rest2
 1451       (body, marker) = breakOnMarker "#-}" rest3
 1452   guardPrefix "#-}" marker
 1453   let consumedLen = length input - length (drop 3 marker)
 1454   pure (name, trimRight body, take consumedLen input)
 1455 
 1456 parseLanguagePragmaNames :: Text -> [ExtensionSetting]
 1457 parseLanguagePragmaNames body =
 1458   mapMaybe (parseExtensionSettingName . T.strip . T.takeWhile (/= '#')) (T.splitOn "," body)
 1459 
 1460 parseOptionsPragmaSettings :: Text -> [ExtensionSetting]
 1461 parseOptionsPragmaSettings body = go (pragmaWords body)
 1462   where
 1463     go ws =
 1464       case ws of
 1465         [] -> []
 1466         "-cpp" : rest -> EnableExtension CPP : go rest
 1467         "-fffi" : rest -> EnableExtension ForeignFunctionInterface : go rest
 1468         "-fglasgow-exts" : rest -> glasgowExtsSettings <> go rest
 1469         opt : rest
 1470           | Just ext <- T.stripPrefix "-X" opt,
 1471             not (T.null ext) ->
 1472               case parseExtensionSettingName ext of
 1473                 Just setting -> setting : go rest
 1474                 Nothing -> go rest
 1475         _ : rest -> go rest
 1476 
-- | The extension set implied by the legacy @-fglasgow-exts@ flag
-- (used by 'parseOptionsPragmaSettings').
--
-- NOTE(review): this list mirrors the historical GHC \/ haskell-src-exts
-- @-fglasgow-exts@ set — confirm against the targeted GHC version before
-- adding or removing entries.
glasgowExtsSettings :: [ExtensionSetting]
glasgowExtsSettings =
  map
    EnableExtension
    [ ConstrainedClassMethods,
      DeriveDataTypeable,
      DeriveFoldable,
      DeriveFunctor,
      DeriveGeneric,
      DeriveTraversable,
      EmptyDataDecls,
      ExistentialQuantification,
      ExplicitNamespaces,
      FlexibleContexts,
      FlexibleInstances,
      ForeignFunctionInterface,
      FunctionalDependencies,
      GeneralizedNewtypeDeriving,
      ImplicitParams,
      InterruptibleFFI,
      KindSignatures,
      LiberalTypeSynonyms,
      MagicHash,
      MultiParamTypeClasses,
      ParallelListComp,
      PatternGuards,
      PostfixOperators,
      RankNTypes,
      RecursiveDo,
      ScopedTypeVariables,
      StandaloneDeriving,
      TypeOperators,
      TypeSynonymInstances,
      UnboxedTuples,
      UnicodeSyntax,
      UnliftedFFITypes
    ]
 1514 
 1515 pragmaWords :: Text -> [Text]
 1516 pragmaWords txt = go [] [] Nothing (T.unpack txt)
 1517   where
 1518     go acc current quote chars =
 1519       case chars of
 1520         [] ->
 1521           let acc' = pushCurrent acc current
 1522            in reverse acc'
 1523         c : rest ->
 1524           case quote of
 1525             Just q
 1526               | c == q -> go acc current Nothing rest
 1527               | c == '\\' ->
 1528                   case rest of
 1529                     escaped : rest' -> go acc (escaped : current) quote rest'
 1530                     [] -> go acc current quote []
 1531               | otherwise -> go acc (c : current) quote rest
 1532             Nothing
 1533               | c == '"' || c == '\'' -> go acc current (Just c) rest
 1534               | c == '\\' ->
 1535                   case rest of
 1536                     escaped : rest' -> go acc (escaped : current) Nothing rest'
 1537                     [] -> go acc current Nothing []
 1538               | c `elem` [' ', '\n', '\r', '\t'] ->
 1539                   let acc' = pushCurrent acc current
 1540                    in go acc' [] Nothing rest
 1541               | otherwise -> go acc (c : current) Nothing rest
 1542 
 1543     pushCurrent acc current =
 1544       case reverse current of
 1545         [] -> acc
 1546         token -> T.pack token : acc
 1547 
 1548 parseHashLineDirective :: String -> Maybe DirectiveUpdate
 1549 parseHashLineDirective raw =
 1550   let trimmed = trimLeft (drop 1 (trimLeft raw))
 1551       trimmed' =
 1552         if "line" `List.isPrefixOf` trimmed
 1553           then dropWhile isSpace (drop 4 trimmed)
 1554           else trimmed
 1555       (digits, _) = span isDigit trimmed'
 1556    in if null digits
 1557         then Nothing
 1558         else Just DirectiveUpdate {directiveLine = Just (read digits), directiveCol = Just 1}
 1559 
 1560 parseControlPragma :: String -> Maybe (String, Either Text DirectiveUpdate)
 1561 parseControlPragma input
 1562   | Just body <- stripPragma "LINE" input =
 1563       let trimmed = words body
 1564        in case trimmed of
 1565             lineNo : _
 1566               | all isDigit lineNo ->
 1567                   Just
 1568                     ( fullPragmaConsumed "LINE" body,
 1569                       Right DirectiveUpdate {directiveLine = Just (read lineNo), directiveCol = Just 1}
 1570                     )
 1571             _ -> Just (fullPragmaConsumed "LINE" body, Left "malformed LINE pragma")
 1572   | Just body <- stripPragma "COLUMN" input =
 1573       let trimmed = words body
 1574        in case trimmed of
 1575             colNo : _
 1576               | all isDigit colNo ->
 1577                   Just
 1578                     ( fullPragmaConsumed "COLUMN" body,
 1579                       Right DirectiveUpdate {directiveLine = Nothing, directiveCol = Just (read colNo)}
 1580                     )
 1581             _ -> Just (fullPragmaConsumed "COLUMN" body, Left "malformed COLUMN pragma")
 1582   | otherwise = Nothing
 1583 
 1584 fullPragmaConsumed :: String -> String -> String
 1585 fullPragmaConsumed name body = "{-# " <> name <> " " <> trimRight body <> " #-}"
 1586 
 1587 mkToken :: LexerState -> LexerState -> Text -> LexTokenKind -> LexToken
 1588 mkToken start end tokTxt kind =
 1589   LexToken
 1590     { lexTokenKind = kind,
 1591       lexTokenText = tokTxt,
 1592       lexTokenSpan = mkSpan start end
 1593     }
 1594 
-- | Build a source span from the lexer positions at the start and end of a
-- token. Callers pass the state before and after 'advanceChars', so the end
-- column is the position just past the last consumed character.
mkSpan :: LexerState -> LexerState -> SourceSpan
mkSpan start end =
  SourceSpan
    { sourceSpanStartLine = lexerLine start,
      sourceSpanStartCol = lexerCol start,
      sourceSpanEndLine = lexerLine end,
      sourceSpanEndCol = lexerCol end
    }
 1603 
 1604 advanceChars :: String -> LexerState -> LexerState
 1605 advanceChars chars st = foldl advanceOne st chars
 1606   where
 1607     advanceOne acc ch =
 1608       case ch of
 1609         '\n' ->
 1610           acc
 1611             { lexerInput = drop 1 (lexerInput acc),
 1612               lexerLine = lexerLine acc + 1,
 1613               lexerCol = 1,
 1614               lexerAtLineStart = True
 1615             }
 1616         _ ->
 1617           acc
 1618             { lexerInput = drop 1 (lexerInput acc),
 1619               lexerCol = lexerCol acc + 1,
 1620               lexerAtLineStart = False
 1621             }
 1622 
 1623 consumeWhile :: (Char -> Bool) -> LexerState -> LexerState
 1624 consumeWhile f st = advanceChars (takeWhile f (lexerInput st)) st
 1625 
 1626 -- | Take digits with optional underscores.
 1627 --
 1628 -- When @allowUnderscores@ is True (NumericUnderscores enabled):
 1629 --   - Underscores may appear between digits, including consecutive underscores
 1630 --   - Leading underscores are NOT allowed (the first character must be a digit)
 1631 --   - Trailing underscores cause lexing to stop (they're not consumed)
 1632 --
 1633 -- When @allowUnderscores@ is False:
 1634 --   - No underscores are accepted; only digits are consumed
 1635 takeDigitsWithUnderscores :: Bool -> (Char -> Bool) -> String -> (String, String)
 1636 takeDigitsWithUnderscores allowUnderscores isDigitChar chars =
 1637   let (firstChunk, rest) = span isDigitChar chars
 1638    in if null firstChunk
 1639         then ("", chars)
 1640         else
 1641           if allowUnderscores
 1642             then go firstChunk rest
 1643             else (firstChunk, rest)
 1644   where
 1645     go acc xs =
 1646       case xs of
 1647         '_' : rest ->
 1648           -- Consume consecutive underscores
 1649           let (underscores, rest') = span (== '_') rest
 1650               allUnderscores = '_' : underscores
 1651               (chunk, rest'') = span isDigitChar rest'
 1652            in if null chunk
 1653                 then (acc, xs) -- Trailing underscore(s), stop here
 1654                 else go (acc <> allUnderscores <> chunk) rest''
 1655         _ -> (acc, xs)
 1656 
 1657 -- | Take digits with optional leading underscores after a base prefix.
 1658 --
 1659 -- When @allowUnderscores@ is True (NumericUnderscores enabled):
 1660 --   - Leading underscores are allowed (e.g., 0x_ff, 0x__ff)
 1661 --   - Underscores may appear between digits, including consecutive underscores
 1662 --   - Trailing underscores cause lexing to stop
 1663 --
 1664 -- When @allowUnderscores@ is False:
 1665 --   - No underscores are accepted; only digits are consumed
 1666 takeDigitsWithLeadingUnderscores :: Bool -> (Char -> Bool) -> String -> (String, String)
 1667 takeDigitsWithLeadingUnderscores allowUnderscores isDigitChar chars
 1668   | not allowUnderscores =
 1669       let (digits, rest) = span isDigitChar chars
 1670        in (digits, rest)
 1671   | otherwise =
 1672       -- With NumericUnderscores, leading underscores are allowed
 1673       let (leadingUnderscores, rest0) = span (== '_') chars
 1674           (firstChunk, rest1) = span isDigitChar rest0
 1675        in if null firstChunk
 1676             then ("", chars) -- Must have at least one digit somewhere
 1677             else go (leadingUnderscores <> firstChunk) rest1
 1678   where
 1679     go acc xs =
 1680       case xs of
 1681         '_' : rest ->
 1682           let (underscores, rest') = span (== '_') rest
 1683               allUnderscores = '_' : underscores
 1684               (chunk, rest'') = span isDigitChar rest'
 1685            in if null chunk
 1686                 then (acc, xs)
 1687                 else go (acc <> allUnderscores <> chunk) rest''
 1688         _ -> (acc, xs)
 1689 
 1690 -- | Parse the exponent part of a float literal (e.g., "e+10", "E-5").
 1691 --
 1692 -- When @allowUnderscores@ is True (NumericUnderscores enabled):
 1693 --   - Underscores may appear before the exponent marker (e.g., "1_e+23")
 1694 --   - This is handled by consuming trailing underscores from the mantissa into the exponent
 1695 --   - Underscores may also appear within the exponent digits
 1696 --
 1697 -- When @allowUnderscores@ is False:
 1698 --   - Only digits are accepted in the exponent
 1699 takeExponent :: Bool -> String -> (String, String)
 1700 takeExponent allowUnderscores chars =
 1701   case chars of
 1702     -- Handle leading underscores before 'e'/'E' when NumericUnderscores enabled
 1703     '_' : rest
 1704       | allowUnderscores ->
 1705           let (underscores, rest') = span (== '_') rest
 1706               allUnderscores = '_' : underscores
 1707            in case rest' of
 1708                 marker : rest2
 1709                   | marker `elem` ("eE" :: String) ->
 1710                       let (signPart, rest3) =
 1711                             case rest2 of
 1712                               sign : more | sign `elem` ("+-" :: String) -> ([sign], more)
 1713                               _ -> ("", rest2)
 1714                           (digits, rest4) = takeDigitsWithUnderscores allowUnderscores isDigit rest3
 1715                        in if null digits
 1716                             then ("", chars)
 1717                             else (allUnderscores <> [marker] <> signPart <> digits, rest4)
 1718                 _ -> ("", chars)
 1719     marker : rest
 1720       | marker `elem` ("eE" :: String) ->
 1721           let (signPart, rest1) =
 1722                 case rest of
 1723                   sign : more | sign `elem` ("+-" :: String) -> ([sign], more)
 1724                   _ -> ("", rest)
 1725               (digits, rest2) = takeDigitsWithUnderscores allowUnderscores isDigit rest1
 1726            in if null digits then ("", chars) else (marker : signPart <> digits, rest2)
 1727     _ -> ("", chars)
 1728 
 1729 takeHexExponent :: String -> Maybe String
 1730 takeHexExponent chars =
 1731   case chars of
 1732     marker : rest
 1733       | marker `elem` ("pP" :: String) ->
 1734           let (signPart, rest1) =
 1735                 case rest of
 1736                   sign : more | sign `elem` ("+-" :: String) -> ([sign], more)
 1737                   _ -> ("", rest)
 1738               (digits, _) = span isDigit rest1
 1739            in if null digits then Nothing else Just (marker : signPart <> digits)
 1740     _ -> Nothing
 1741 
 1742 scanQuoted :: Char -> String -> Either String (String, String)
 1743 scanQuoted endCh = go []
 1744   where
 1745     go acc chars =
 1746       case chars of
 1747         [] -> Left (reverse acc)
 1748         c : rest
 1749           | c == endCh -> Right (reverse acc, rest)
 1750           | c == '\\' ->
 1751               case rest of
 1752                 escaped : rest' -> go (escaped : c : acc) rest'
 1753                 [] -> Left (reverse (c : acc))
 1754           | otherwise -> go (c : acc) rest
 1755 
 1756 takeQuoter :: String -> (String, String)
 1757 takeQuoter chars =
 1758   case chars of
 1759     c : rest
 1760       | isIdentStart c ->
 1761           let (tailChars, rest0) = span isIdentTailOrStart rest
 1762            in go (c : tailChars) rest0
 1763     _ -> ("", chars)
 1764   where
 1765     go acc chars' =
 1766       case chars' of
 1767         '.' : c : rest
 1768           | isIdentStart c ->
 1769               let (tailChars, rest0) = span isIdentTailOrStart rest
 1770                in go (acc <> "." <> [c] <> tailChars) rest0
 1771         _ -> (acc, chars')
 1772 
 1773 breakOnMarker :: String -> String -> (String, String)
 1774 breakOnMarker marker = go []
 1775   where
 1776     go acc chars
 1777       | marker `List.isPrefixOf` chars = (reverse acc, chars)
 1778       | otherwise =
 1779           case chars of
 1780             [] -> (reverse acc, [])
 1781             c : rest -> go (c : acc) rest
 1782 
 1783 takeLineRemainder :: String -> String
 1784 takeLineRemainder chars =
 1785   let (prefix, rest) = break (== '\n') chars
 1786    in prefix <> take 1 rest
 1787 
 1788 trimLeft :: String -> String
 1789 trimLeft = dropWhile isSpace
 1790 
 1791 trimRight :: String -> String
 1792 trimRight = reverse . dropWhile isSpace . reverse
 1793 
 1794 guardPrefix :: (Eq a) => [a] -> [a] -> Maybe ()
 1795 guardPrefix prefix actual =
 1796   if prefix `List.isPrefixOf` actual
 1797     then Just ()
 1798     else Nothing
 1799 
 1800 readMaybeChar :: String -> Maybe Char
 1801 readMaybeChar raw =
 1802   case reads raw of
 1803     [(c, "")] -> Just c
 1804     _ -> Nothing
 1805 
 1806 readHexLiteral :: Text -> Integer
 1807 readHexLiteral txt =
 1808   case readHex (T.unpack (T.filter (/= '_') (T.drop 2 txt))) of
 1809     [(n, "")] -> n
 1810     _ -> 0
 1811 
 1812 readOctLiteral :: Text -> Integer
 1813 readOctLiteral txt =
 1814   case readOct (T.unpack (T.filter (/= '_') (T.drop 2 txt))) of
 1815     [(n, "")] -> n
 1816     _ -> 0
 1817 
 1818 readBinLiteral :: Text -> Integer
 1819 readBinLiteral txt =
 1820   case readInt 2 (`elem` ("01" :: String)) digitToInt (T.unpack (T.filter (/= '_') (T.drop 2 txt))) of
 1821     [(n, "")] -> n
 1822     _ -> 0
 1823 
 1824 parseHexFloatLiteral :: String -> String -> String -> Double
 1825 parseHexFloatLiteral intDigits fracDigits expo =
 1826   (parseHexDigits intDigits + parseHexFraction fracDigits) * (2 ^^ exponentValue expo)
 1827 
 1828 parseHexDigits :: String -> Double
 1829 parseHexDigits = foldl (\acc d -> acc * 16 + fromIntegral (digitToInt d)) 0
 1830 
 1831 parseHexFraction :: String -> Double
 1832 parseHexFraction ds =
 1833   sum [fromIntegral (digitToInt d) / (16 ^^ i) | (d, i) <- zip ds [1 :: Int ..]]
 1834 
 1835 exponentValue :: String -> Int
 1836 exponentValue expo =
 1837   case expo of
 1838     _ : '-' : ds -> negate (read ds)
 1839     _ : '+' : ds -> read ds
 1840     _ : ds -> read ds
 1841     _ -> 0
 1842 
 1843 isIdentStart :: Char -> Bool
 1844 isIdentStart c = isAsciiUpper c || isAsciiLower c || c == '_'
 1845 
 1846 isIdentTail :: Char -> Bool
 1847 isIdentTail c = isAlphaNum c || c == '_' || c == '\''
 1848 
 1849 isSymbolicOpChar :: Char -> Bool
 1850 isSymbolicOpChar c = c `elem` (":!#$%&*+./<=>?@\\^|-~" :: String) || isUnicodeSymbol c
 1851 
 1852 -- | Unicode symbols that may be used with UnicodeSyntax extension.
 1853 -- These are: ∷ ⇒ → ← ∀ ★ ⤙ ⤚ ⤛ ⤜ ⦇ ⦈ ⟦ ⟧ ⊸
 1854 isUnicodeSymbol :: Char -> Bool
 1855 isUnicodeSymbol c =
 1856   c == '∷' -- U+2237 PROPORTION (for ::)
 1857     || c == '⇒' -- U+21D2 RIGHTWARDS DOUBLE ARROW (for =>)
 1858     || c == '→' -- U+2192 RIGHTWARDS ARROW (for ->)
 1859     || c == '←' -- U+2190 LEFTWARDS ARROW (for <-)
 1860     || c == '∀' -- U+2200 FOR ALL (for forall)
 1861     || c == '★' -- U+2605 BLACK STAR (for *)
 1862     || c == '⤙' -- U+2919 LEFTWARDS ARROW-TAIL (for -<)
 1863     || c == '⤚' -- U+291A RIGHTWARDS ARROW-TAIL (for >-)
 1864     || c == '⤛' -- U+291B LEFTWARDS DOUBLE ARROW-TAIL (for -<<)
 1865     || c == '⤜' -- U+291C RIGHTWARDS DOUBLE ARROW-TAIL (for >>-)
 1866     || c == '⦇' -- U+2987 Z NOTATION LEFT IMAGE BRACKET (for (|)
 1867     || c == '⦈' -- U+2988 Z NOTATION RIGHT IMAGE BRACKET (for |))
 1868     || c == '⟦' -- U+27E6 MATHEMATICAL LEFT WHITE SQUARE BRACKET (for [|)
 1869     || c == '⟧' -- U+27E7 MATHEMATICAL RIGHT WHITE SQUARE BRACKET (for |])
 1870     || c == '⊸' -- U+22B8 MULTIMAP (for %1-> with LinearTypes)
 1871 
 1872 -- | Check if the remainder after '--' should start a line comment.
 1873 -- Per Haskell Report: '--' starts a comment only if the entire symbol sequence
 1874 -- consists solely of dashes, or is not followed by any symbol character.
 1875 -- E.g., '-- foo' is a comment, '---' is a comment, but '-->' is an operator.
 1876 isLineComment :: String -> Bool
 1877 isLineComment rest =
 1878   case rest of
 1879     [] -> True -- Just '--' followed by nothing or whitespace
 1880     c : _
 1881       | c == '-' -> isLineComment (dropWhile (== '-') rest) -- More dashes, keep checking
 1882       | isSymbolicOpChar c -> False -- Non-dash symbol char means it's an operator
 1883       | otherwise -> True -- Non-symbol char means comment
 1884 
 1885 isIdentTailOrStart :: Char -> Bool
 1886 isIdentTailOrStart c = isAlphaNum c || c == '_' || c == '\''
 1887 
 1888 isReservedIdentifier :: Text -> Bool
 1889 isReservedIdentifier = isJust . keywordTokenKind
 1890 
 1891 keywordTokenKind :: Text -> Maybe LexTokenKind
 1892 keywordTokenKind txt = case txt of
 1893   "case" -> Just TkKeywordCase
 1894   "class" -> Just TkKeywordClass
 1895   "data" -> Just TkKeywordData
 1896   "default" -> Just TkKeywordDefault
 1897   "deriving" -> Just TkKeywordDeriving
 1898   "do" -> Just TkKeywordDo
 1899   "else" -> Just TkKeywordElse
 1900   "foreign" -> Just TkKeywordForeign
 1901   "if" -> Just TkKeywordIf
 1902   "import" -> Just TkKeywordImport
 1903   "in" -> Just TkKeywordIn
 1904   "infix" -> Just TkKeywordInfix
 1905   "infixl" -> Just TkKeywordInfixl
 1906   "infixr" -> Just TkKeywordInfixr
 1907   "instance" -> Just TkKeywordInstance
 1908   "let" -> Just TkKeywordLet
 1909   "module" -> Just TkKeywordModule
 1910   "newtype" -> Just TkKeywordNewtype
 1911   "of" -> Just TkKeywordOf
 1912   "then" -> Just TkKeywordThen
 1913   "type" -> Just TkKeywordType
 1914   "where" -> Just TkKeywordWhere
 1915   "_" -> Just TkKeywordUnderscore
 1916   -- Context-sensitive keywords (not strictly reserved per Report)
 1917   "qualified" -> Just TkKeywordQualified
 1918   "as" -> Just TkKeywordAs
 1919   "hiding" -> Just TkKeywordHiding
 1920   _ -> Nothing
 1921 
 1922 -- | Classify reserved operators per Haskell Report Section 2.4.
 1923 reservedOpTokenKind :: Text -> Maybe LexTokenKind
 1924 reservedOpTokenKind txt = case txt of
 1925   ".." -> Just TkReservedDotDot
 1926   ":" -> Just TkReservedColon
 1927   "::" -> Just TkReservedDoubleColon
 1928   "=" -> Just TkReservedEquals
 1929   "\\" -> Just TkReservedBackslash
 1930   "|" -> Just TkReservedPipe
 1931   "<-" -> Just TkReservedLeftArrow
 1932   "->" -> Just TkReservedRightArrow
 1933   "@" -> Just TkReservedAt
 1934   -- Note: ~ is NOT reserved; it uses whitespace-sensitive lexing (GHC proposal 0229)
 1935   "=>" -> Just TkReservedDoubleArrow
 1936   _ -> Nothing