never executed always true always false
1 {-# LANGUAGE DeriveAnyClass #-}
2 {-# LANGUAGE DeriveGeneric #-}
3 {-# LANGUAGE OverloadedStrings #-}
4
5 -- |
6 -- Module : Aihc.Parser.Lex
7 -- Description : Lex Haskell source into span-annotated tokens with inline extension handling
8 --
9 -- This module performs the pre-parse tokenization step for Haskell source code.
10 -- It turns raw text into 'LexToken's that preserve:
11 --
12 -- * a semantic token classification ('LexTokenKind')
13 -- * the original token text ('lexTokenText')
14 -- * source location information ('lexTokenSpan')
15 --
16 -- The lexer runs in two phases:
17 --
18 -- 1. /Raw tokenization/ with a custom incremental scanner that consumes one or more
19 -- input chunks and emits tokens lazily. Extension-specific lexing (such as
20 -- @NegativeLiterals@ and @LexicalNegation@) is handled inline during this phase
21 -- by tracking the previous token context.
22 -- 2. /Layout insertion/ ('applyLayoutTokens') that inserts virtual @{@, @;@ and @}@
23 -- according to indentation (the offside rule), so the parser can treat implicit
24 -- layout like explicit braces and semicolons.
25 --
26 -- Scanning is incremental and error-tolerant:
27 --
28 -- * token production starts as soon as enough input is available
29 -- * malformed lexemes produce 'TkError' tokens instead of aborting lexing
30 -- * @# ...@, @#line ...@, @{-# LINE #-}@ and @{-# COLUMN #-}@ are handled in-band by
31 -- the lexer and update subsequent token spans without being exposed as normal tokens
32 --
33 -- Layout-sensitive syntax is the tricky part. The implementation tracks a stack of
34 -- layout contexts and mirrors the @haskell-src-exts@ model summarized in
35 -- @docs/hse-indentation-layout.md@:
36 --
37 -- * after layout-introducing keywords (currently @do@, @of@, @let@, @where@, @\\case@, plus optional module
38 -- body layout), mark a pending implicit block
39 -- * if the next token is an explicit @{@, disable implicit insertion for that block
40 -- * otherwise, open an implicit layout context at the next token column
41 -- * at beginning-of-line tokens, dedent emits virtual @}@, equal-indent emits virtual
42 -- @;@ (with a small suppression rule for @then@/@else@)
43 --
44 -- Keyword classification is intentionally lexical and exact. 'lexIdentifier'
45 -- produces a keyword token /only/ when the full identifier text exactly matches a
46 -- reserved word in 'keywordTokenKind'. That means:
47 --
48 -- * @where@ becomes 'TkKeywordWhere'
49 -- * @where'@, @_where@, and @M.where@ remain identifiers
50 --
51 -- In other words, use keyword tokens only for exact reserved lexemes; contextual
52 -- validity is left to the parser.
53 module Aihc.Parser.Lex
54 ( LexToken (..),
55 LexTokenKind (..),
56 isReservedIdentifier,
57 readModuleHeaderExtensions,
58 readModuleHeaderExtensionsFromChunks,
59 lexTokensFromChunks,
60 lexModuleTokensFromChunks,
61 lexTokensWithExtensions,
62 lexModuleTokensWithExtensions,
63 lexTokens,
64 lexModuleTokens,
65 )
66 where
67
68 import Aihc.Parser.Syntax
69 import Control.DeepSeq (NFData)
70 import Data.Char (digitToInt, isAlphaNum, isAsciiLower, isAsciiUpper, isDigit, isHexDigit, isOctDigit, isSpace)
71 import Data.List qualified as List
72 import Data.Maybe (fromMaybe, isJust, mapMaybe)
73 import Data.Text (Text)
74 import Data.Text qualified as T
75 import GHC.Generics (Generic)
76 import Numeric (readHex, readInt, readOct)
77
-- | Semantic classification of a lexed token.
--
-- Each 'LexToken' carries one of these kinds alongside the original token
-- text and its source span. Classification is purely lexical; contextual
-- validity (where a keyword may legally appear) is left to the parser.
data LexTokenKind
  = -- Keywords (reserved identifiers per Haskell Report Section 2.4)
    TkKeywordCase
  | TkKeywordClass
  | TkKeywordData
  | TkKeywordDefault
  | TkKeywordDeriving
  | TkKeywordDo
  | TkKeywordElse
  | TkKeywordForeign
  | TkKeywordIf
  | TkKeywordImport
  | TkKeywordIn
  | TkKeywordInfix
  | TkKeywordInfixl
  | TkKeywordInfixr
  | TkKeywordInstance
  | TkKeywordLet
  | TkKeywordModule
  | TkKeywordNewtype
  | TkKeywordOf
  | TkKeywordThen
  | TkKeywordType
  | TkKeywordWhere
  | TkKeywordUnderscore -- _ (wildcard, reserved per Report)
  | -- Context-sensitive keywords (not strictly reserved per Report, but needed for imports)
    TkKeywordQualified
  | TkKeywordAs
  | TkKeywordHiding
  | -- Reserved operators (per Haskell Report Section 2.4)
    TkReservedDotDot -- ..
  | TkReservedColon -- :
  | TkReservedDoubleColon -- ::
  | TkReservedEquals -- =
  | TkReservedBackslash -- \
  | TkReservedPipe -- \"|\"
  | TkReservedLeftArrow -- <-
  | TkReservedRightArrow -- ->
  | TkReservedAt -- @
  | -- Note: ~ is NOT reserved; it uses whitespace-sensitive lexing (GHC proposal 0229)
    TkReservedDoubleArrow -- =>
  | -- Identifiers (per Haskell Report Section 2.4)
    TkVarId Text -- variable identifier (starts lowercase/_)
  | TkConId Text -- constructor identifier (starts uppercase)
  | TkQVarId Text -- qualified variable identifier
  | TkQConId Text -- qualified constructor identifier
  | -- Operators (per Haskell Report Section 2.4)
    TkVarSym Text -- variable symbol (doesn't start with :)
  | TkConSym Text -- constructor symbol (starts with :)
  | TkQVarSym Text -- qualified variable symbol
  | TkQConSym Text -- qualified constructor symbol
  | -- Literals
    TkInteger Integer
  | TkIntegerBase Integer Text
  | TkFloat Double Text
  | TkChar Char
  | TkString Text
  | -- Special characters (per Haskell Report Section 2.2)
    TkSpecialLParen -- (
  | TkSpecialRParen -- )
  | TkSpecialComma -- ,
  | TkSpecialSemicolon -- ;
  | TkSpecialLBracket -- [
  | TkSpecialRBracket -- ]
  | TkSpecialBacktick -- `
  | TkSpecialLBrace -- {
  | TkSpecialRBrace -- }
  | -- LexicalNegation support
    TkMinusOperator -- minus operator when LexicalNegation enabled (before prefix detection)
  | TkPrefixMinus -- prefix minus (tight, no space) for LexicalNegation
  | -- Whitespace-sensitive operator support (GHC proposal 0229)
    TkPrefixBang -- prefix bang (!x) for bang patterns
  | TkPrefixTilde -- prefix tilde (~x) for irrefutable patterns
  | -- Pragmas
    TkPragmaLanguage [ExtensionSetting]
  | TkPragmaWarning Text
  | TkPragmaDeprecated Text
  | -- Other
    TkQuasiQuote Text Text
  | TkError Text -- error description (e.g. "unexpected character"); lexing continues past it
  deriving (Eq, Ord, Show, Read, Generic, NFData)
159
-- | A single lexed token: its classification, original source text, and span.
data LexToken = LexToken
  { -- | Semantic classification of the token.
    lexTokenKind :: !LexTokenKind,
    -- | The token text as it appeared in the source (virtual layout tokens
    -- carry their symbol text, e.g. @"{"@).
    lexTokenText :: !Text,
    -- | Source location; 'NoSourceSpan' when no anchor exists (e.g. virtual
    -- closers emitted at end of input).
    lexTokenSpan :: !SourceSpan
  }
  deriving (Eq, Ord, Show, Generic, NFData)
166
-- | Scanner state threaded through raw tokenization.
data LexerState = LexerState
  { -- | Remaining input still to be scanned.
    lexerInput :: String,
    -- | Current line of the scan position (1-based; see initial state).
    lexerLine :: !Int,
    -- | Current column of the scan position (1-based; see initial state).
    lexerCol :: !Int,
    -- | Whether the scan position is at the start of a line.
    lexerAtLineStart :: !Bool,
    -- | Synthetic tokens queued for emission before scanning continues
    -- (e.g. 'TkError' tokens produced while skipping trivia).
    lexerPending :: [LexToken],
    -- | Extensions that influence scanning (e.g. NegativeLiterals).
    lexerExtensions :: [Extension],
    -- | The kind of the previous non-trivia token (for NegativeLiterals/LexicalNegation)
    lexerPrevTokenKind :: !(Maybe LexTokenKind),
    -- | Whether trivia (whitespace/comments) was skipped since the last token
    lexerHadTrivia :: !Bool
  }
  deriving (Eq, Show)
180
-- | One entry of the layout-context stack maintained during layout insertion.
data LayoutContext
  = -- | Block opened by a literal @{@; no virtual tokens are inserted for it.
    LayoutExplicit
  | -- | Implicit layout block opened at the given column.
    LayoutImplicit !Int
  | -- | Implicit layout opened by @let@; closed early by @in@ (and by @,@
    -- outside delimiters, see 'closeBeforeToken').
    LayoutImplicitLet !Int
  | -- | Implicit layout opened after 'then do' or 'else do'.
    -- This variant allows 'then' and 'else' to close it at the same indent level.
    LayoutImplicitAfterThenElse !Int
  | -- | Marker for ( or [ to scope implicit layout closures
    LayoutDelimiter
  deriving (Eq, Show)
191
-- | A layout block announced by a keyword but not yet opened; resolved at the
-- next token by 'openPendingLayout'.
data PendingLayout
  = -- | Pending layout from @do@, @of@, @where@, @\\case@, or module-body start.
    PendingLayoutGeneric
  | -- | Pending layout from @let@; opens a 'LayoutImplicitLet' context.
    PendingLayoutLet
  | -- | Pending layout from 'do' after 'then' or 'else'.
    -- The resulting layout can be closed by 'then'/'else' at the same indent.
    PendingLayoutAfterThenElse
  deriving (Eq, Show)
199
-- | Progress of module-top layout insertion (active only when the caller
-- enables module layout).
data ModuleLayoutMode
  = -- | Module-top layout disabled (expression/declaration lexing).
    ModuleLayoutOff
  | -- | Skipping leading pragmas, looking for @module@ or the first body token.
    ModuleLayoutSeekStart
  | -- | Saw @module@; waiting for the header's @where@ before opening body layout.
    ModuleLayoutAwaitWhere
  | -- | Module-top layout has been resolved; no further transitions.
    ModuleLayoutDone
  deriving (Eq, Show)
206
-- | State threaded through 'applyLayoutTokens' while inserting virtual tokens.
data LayoutState = LayoutState
  { -- | Stack of open layout contexts, innermost first.
    layoutContexts :: [LayoutContext],
    -- | Layout block announced by the previous token, to be opened at the
    -- next token.
    layoutPendingLayout :: !(Maybe PendingLayout),
    -- | Start line of the previous token (used to detect beginning-of-line).
    layoutPrevLine :: !(Maybe Int),
    -- | Kind of the previous token (used for @then do@ / @\\case@ detection).
    layoutPrevTokenKind :: !(Maybe LexTokenKind),
    -- | Number of currently open @(@ or @[@ delimiters.
    layoutDelimiterDepth :: !Int,
    -- | Progress of module-top layout handling.
    layoutModuleMode :: !ModuleLayoutMode
  }
  deriving (Eq, Show)
216
-- | Position override parsed from an in-band control directive; 'Nothing'
-- leaves the corresponding coordinate unchanged.
-- NOTE(review): presumably produced by the @{-# LINE #-}@/@#line@ handling
-- mentioned in the module header — the consuming code is not visible in this
-- excerpt; confirm against the directive parsers.
data DirectiveUpdate = DirectiveUpdate
  { directiveLine :: !(Maybe Int),
    directiveCol :: !(Maybe Int)
  }
  deriving (Eq, Show)
222
-- | Convenience lexer entrypoint: no extensions, parse as expression/declaration stream.
--
-- Consumes a single strict 'Text' chunk and yields tokens lazily. Malformed
-- lexemes surface as 'TkError' tokens instead of aborting the lex.
lexTokens :: Text -> [LexToken]
lexTokens input = lexTokensFromChunks [input]
230
-- | Convenience lexer entrypoint for full modules: no explicit extension list.
--
-- The leading header pragmas are scanned first so that extensions they enable
-- are in force before token rewrites and top-level layout insertion.
lexModuleTokens :: Text -> [LexToken]
lexModuleTokens input = lexModuleTokensFromChunks headerExtensions [input]
  where
    headerExtensions =
      enabledExtensionsFromSettings (readModuleHeaderExtensionsFromChunks [input])
240
-- | Lex an expression/declaration stream from one or more input chunks.
--
-- Tokens are produced lazily, so downstream consumers can begin parsing
-- before the full source has been scanned. No extensions are enabled.
lexTokensFromChunks :: [Text] -> [LexToken]
lexTokensFromChunks chunks = lexTokensFromChunksWithExtensions [] chunks
247
-- | Lex a full module from one or more input chunks with explicit extensions.
--
-- In addition to the normal token scan and extension rewrites, this enables
-- module-body layout insertion.
lexModuleTokensFromChunks :: [Extension] -> [Text] -> [LexToken]
lexModuleTokensFromChunks exts chunks = lexChunksWithExtensions True exts chunks
254
-- | Lex source text using explicit lexer extensions.
--
-- Runs raw tokenization, extension rewrites, and implicit-layout insertion.
-- Module-top layout is /not/ enabled here. Malformed lexemes become 'TkError'
-- tokens in the token stream.
lexTokensWithExtensions :: [Extension] -> Text -> [LexToken]
lexTokensWithExtensions exts = lexTokensFromChunksWithExtensions exts . (: [])
262
-- | Lex module source text using explicit lexer extensions.
--
-- Like 'lexTokensWithExtensions', but additionally enables top-level
-- module-body layout: when explicit braces are omitted, virtual layout tokens
-- are inserted after @module ... where@ (or from the first non-pragma token
-- in module-less files).
lexModuleTokensWithExtensions :: [Extension] -> Text -> [LexToken]
lexModuleTokensWithExtensions exts = lexModuleTokensFromChunks exts . (: [])
270
-- | Internal chunked lexer entrypoint for non-module inputs.
--
-- Lets callers stream input through the shared scanner while still selecting
-- extension-driven token rewrites; module-top layout stays disabled.
lexTokensFromChunksWithExtensions :: [Extension] -> [Text] -> [LexToken]
lexTokensFromChunksWithExtensions exts chunks = lexChunksWithExtensions False exts chunks
277
-- | Run the full lexer pipeline over chunked input.
--
-- Raw scanning (with inline extension handling) operates over the
-- concatenated chunk stream; the resulting tokens then pass through the
-- layout insertion step.
lexChunksWithExtensions :: Bool -> [Extension] -> [Text] -> [LexToken]
lexChunksWithExtensions enableModuleLayout exts chunks =
  applyLayoutTokens enableModuleLayout rawTokens
  where
    rawTokens = scanTokens startState
    startState =
      LexerState
        { lexerInput = concatMap T.unpack chunks,
          lexerLine = 1,
          lexerCol = 1,
          lexerAtLineStart = True,
          lexerPending = [],
          lexerExtensions = exts,
          lexerPrevTokenKind = Nothing,
          -- Start of file is treated as having leading trivia.
          lexerHadTrivia = True
        }
297
-- | Read leading module-header pragmas and return parsed LANGUAGE settings.
--
-- Scans only the pragma/header prefix (whitespace and comments allowed) and
-- stops at the first non-pragma token or lexer error token.
readModuleHeaderExtensions :: Text -> [ExtensionSetting]
readModuleHeaderExtensions = readModuleHeaderExtensionsFromChunks . (: [])
304
-- | Read leading module-header pragmas from one or more input chunks.
--
-- Scans only the pragma/header prefix (whitespace and comments allowed) and
-- stops at the first non-pragma token or lexer error token.
readModuleHeaderExtensionsFromChunks :: [Text] -> [ExtensionSetting]
readModuleHeaderExtensionsFromChunks chunks = collect (lexTokensFromChunks chunks)
  where
    collect [] = []
    collect (tok : rest) =
      case lexTokenKind tok of
        TkPragmaLanguage settings -> settings <> collect rest
        TkPragmaWarning _ -> collect rest
        TkPragmaDeprecated _ -> collect rest
        -- Any other token (including TkError) ends the header prefix.
        _ -> []
319
-- | Fold LANGUAGE settings left-to-right into the list of enabled extensions:
-- enables append (without duplicates), disables remove all occurrences.
enabledExtensionsFromSettings :: [ExtensionSetting] -> [Extension]
enabledExtensionsFromSettings settings = List.foldl' step [] settings
  where
    step acc (EnableExtension ext)
      | ext `elem` acc = acc
      | otherwise = acc <> [ext]
    step acc (DisableExtension ext) = filter (/= ext) acc
329
-- | Produce the lazy stream of raw lexical tokens.
--
-- Pending synthetic tokens are emitted first, then trivia is skipped, and
-- finally the next real token is scanned from the remaining input.
--
-- The lexer tracks the previous token kind and whether trivia was consumed
-- between tokens, which enables inline handling of LexicalNegation and
-- NegativeLiterals. Every emission path resets those two fields identically,
-- so that bookkeeping lives in a single 'emit' helper instead of being
-- repeated per branch.
scanTokens :: LexerState -> [LexToken]
scanTokens st0 =
  case lexerPending st0 of
    tok : rest -> emit tok st0 {lexerPending = rest}
    [] ->
      let st = skipTrivia st0
       in case lexerPending st of
            -- skipTrivia may have enqueued tokens (e.g. errors from bad pragmas).
            tok : rest -> emit tok st {lexerPending = rest}
            []
              | null (lexerInput st) -> []
              | otherwise ->
                  let (tok, st') = nextToken st
                   in emit tok st'
  where
    -- Emit a token and continue scanning with the previous-token context
    -- updated; hadTrivia resets to False until the next trivia run.
    emit tok st =
      tok : scanTokens st {lexerPrevTokenKind = Just (lexTokenKind tok), lexerHadTrivia = False}
356
-- | Skip ignorable trivia until the next token boundary.
--
-- Control directives are treated specially: valid directives update lexer
-- position state without emitting a token, while malformed directives enqueue
-- 'TkError' tokens for later emission.
skipTrivia :: LexerState -> LexerState
skipTrivia st =
  case consumeTrivia st of
    Nothing -> st
    Just st' -> skipTrivia st'
364
-- | Try to consume one piece of trivia (whitespace, comment, in-band pragma,
-- or control directive) from the front of the input.
--
-- Returns 'Nothing' when the next input starts a real token (or the input is
-- empty), so 'skipTrivia' stops. Note the pattern order: the @c : _@ guards
-- fall through to the later comment/pragma patterns when they fail.
consumeTrivia :: LexerState -> Maybe LexerState
consumeTrivia st
  | null (lexerInput st) = Nothing
  | otherwise =
      case lexerInput st of
        c : _
          | c == ' ' || c == '\t' || c == '\r' -> Just (markHadTrivia (consumeWhile (\x -> x == ' ' || x == '\t' || x == '\r') st))
          | c == '\n' -> Just (markHadTrivia (advanceChars "\n" st))
        '-' : '-' : rest
          | isLineComment rest -> Just (markHadTrivia (consumeLineComment st))
        '{' : '-' : '#' : _ ->
          case tryConsumeControlPragma st of
            -- Valid control pragma: position updated in-band, no token emitted.
            Just (Nothing, st') -> Just (markHadTrivia st')
            -- Malformed control pragma: enqueue its error token for later emission.
            Just (Just tok, st') -> Just (markHadTrivia st' {lexerPending = lexerPending st' <> [tok]})
            Nothing ->
              case tryConsumeKnownPragma st of
                -- A known pragma starts here; it is a real token, so stop
                -- skipping trivia and let the token scanner handle it.
                Just _ -> Nothing
                Nothing ->
                  markHadTrivia <$> consumeUnknownPragma st
        '{' : '-' : _ ->
          Just (markHadTrivia (consumeBlockCommentOrError st))
        _ ->
          case tryConsumeLineDirective st of
            Just (Nothing, st') -> Just (markHadTrivia st')
            Just (Just tok, st') -> Just (markHadTrivia st' {lexerPending = lexerPending st' <> [tok]})
            Nothing -> Nothing
391
-- | Record that trivia was consumed since the last emitted token.
markHadTrivia :: LexerState -> LexerState
markHadTrivia lexer = lexer {lexerHadTrivia = True}
395
-- | Scan the next real token, trying each token parser in order; if none
-- matches, emit a 'TkError' for the unexpected character.
nextToken :: LexerState -> (LexToken, LexerState)
nextToken st =
  fromMaybe (lexErrorToken st "unexpected character") (tryEach tokenParsers)
  where
    tokenParsers =
      [ lexKnownPragma,
        lexQuasiQuote,
        lexHexFloat,
        lexFloat,
        lexIntBase,
        lexInt,
        lexPromotedQuote,
        lexChar,
        lexString,
        lexSymbol,
        lexIdentifier,
        lexNegativeLiteralOrMinus,
        lexBangOrTildeOperator, -- must come before lexOperator
        lexOperator
      ]

    -- First parser that succeeds wins; later parsers are never run.
    tryEach = foldr (\parse fallback -> maybe fallback Just (parse st)) Nothing
422
-- | Insert virtual layout tokens (@{@, @;@, @}@) into the raw token stream.
--
-- The flag enables module-top layout. Per token, the pipeline is:
--
-- 1. note module-layout transitions triggered before the token,
-- 2. close contexts that must end before the token (parse-error rules),
-- 3. open any layout announced by the previous keyword,
-- 4. apply beginning-of-line dedent/same-indent rules — skipped when step 3
--    just opened a context at this very token,
-- 5. update per-token context (delimiters, pending layout, previous token).
applyLayoutTokens :: Bool -> [LexToken] -> [LexToken]
applyLayoutTokens enableModuleLayout =
  go
    LayoutState
      { layoutContexts = [],
        layoutPendingLayout = Nothing,
        layoutPrevLine = Nothing,
        layoutPrevTokenKind = Nothing,
        layoutDelimiterDepth = 0,
        layoutModuleMode =
          if enableModuleLayout
            then ModuleLayoutSeekStart
            else ModuleLayoutOff
      }
  where
    go st toks =
      case toks of
        -- End of input: close every implicit context that is still open.
        [] -> closeAllImplicit (layoutContexts st) NoSourceSpan
        tok : rest ->
          let stModule = noteModuleLayoutBeforeToken st tok
              (preInserted, stBeforePending) = closeBeforeToken stModule tok
              (pendingInserted, stAfterPending, skipBOL) = openPendingLayout stBeforePending tok
              (bolInserted, stAfterBOL) = if skipBOL then ([], stAfterPending) else bolLayout stAfterPending tok
              stAfterToken = noteModuleLayoutAfterToken (stepTokenContext stAfterBOL tok) tok
              stNext =
                stAfterToken
                  { layoutPrevLine = Just (tokenStartLine tok),
                    layoutPrevTokenKind = Just (lexTokenKind tok)
                  }
           in preInserted <> pendingInserted <> bolInserted <> (tok : go stNext rest)
453
-- | Module-layout transition seen /before/ a token: while seeking the module
-- start, pragmas are skipped, @module@ begins the header, and any other token
-- starts the (module-less) body by announcing a generic pending layout.
noteModuleLayoutBeforeToken :: LayoutState -> LexToken -> LayoutState
noteModuleLayoutBeforeToken st tok
  | layoutModuleMode st /= ModuleLayoutSeekStart = st
  | otherwise =
      case lexTokenKind tok of
        TkPragmaLanguage _ -> st
        TkPragmaWarning _ -> st
        TkPragmaDeprecated _ -> st
        TkKeywordModule -> st {layoutModuleMode = ModuleLayoutAwaitWhere}
        _ -> st {layoutModuleMode = ModuleLayoutDone, layoutPendingLayout = Just PendingLayoutGeneric}
465
-- | Module-layout transition seen /after/ a token: the header's @where@
-- completes the module header and announces the module-body layout.
noteModuleLayoutAfterToken :: LayoutState -> LexToken -> LayoutState
noteModuleLayoutAfterToken st tok
  | layoutModuleMode st == ModuleLayoutAwaitWhere,
    lexTokenKind tok == TkKeywordWhere =
      st {layoutModuleMode = ModuleLayoutDone, layoutPendingLayout = Just PendingLayoutGeneric}
  | otherwise = st
473
-- | Resolve a pending layout announcement at the next token.
--
-- If the token is an explicit @{@, the block is explicit and no virtual
-- tokens are inserted. Otherwise an implicit context opens at the token's
-- column — unless that column fails to increase over the enclosing layout
-- indent, in which case the block is empty and @{}@ is emitted immediately.
-- The returned 'Bool' tells the caller to skip beginning-of-line handling for
-- this token (the context was just opened at it).
openPendingLayout :: LayoutState -> LexToken -> ([LexToken], LayoutState, Bool)
openPendingLayout st tok =
  case layoutPendingLayout st of
    Nothing -> ([], st, False)
    Just pending ->
      case lexTokenKind tok of
        -- Explicit brace: implicit insertion disabled for this block.
        TkSpecialLBrace -> ([], st {layoutPendingLayout = Nothing}, False)
        _ ->
          let col = tokenStartCol tok
              parentIndent = currentLayoutIndent (layoutContexts st)
              openTok = virtualSymbolToken "{" (lexTokenSpan tok)
              closeTok = virtualSymbolToken "}" (lexTokenSpan tok)
              -- Which implicit context variant the pending keyword asked for.
              newContext =
                case pending of
                  PendingLayoutGeneric -> LayoutImplicit col
                  PendingLayoutLet -> LayoutImplicitLet col
                  PendingLayoutAfterThenElse -> LayoutImplicitAfterThenElse col
           in if col <= parentIndent
                -- No indent increase: the implicit block is empty ({}).
                then ([openTok, closeTok], st {layoutPendingLayout = Nothing}, False)
                else
                  ( [openTok],
                    st
                      { layoutPendingLayout = Nothing,
                        layoutContexts = newContext : layoutContexts st
                      },
                    True
                  )
501
-- | Close layout contexts that must end before the given token (the Haskell
-- Report's "parse-error" rule), returning the virtual @}@ tokens to emit.
--
-- Each branch only selects /which/ closing strategy applies; the repacking of
-- the updated context stack into the state is shared by 'closeWith'.
closeBeforeToken :: LayoutState -> LexToken -> ([LexToken], LayoutState)
closeBeforeToken st tok =
  case lexTokenKind tok of
    -- 'in' terminates the implicit 'let' block(s) on top of the stack.
    TkKeywordIn -> closeWith closeLeadingImplicitLets
    -- A comma outside ( / [ delimiters also terminates leading 'let' blocks.
    TkSpecialComma
      | layoutDelimiterDepth st == 0 -> closeWith closeLeadingImplicitLets
    -- Close implicit layout contexts before closing delimiters (parse-error rule)
    TkSpecialRParen -> closeWith closeAllImplicitBeforeDelimiter
    TkSpecialRBracket -> closeWith closeAllImplicitBeforeDelimiter
    TkSpecialRBrace -> closeWith closeAllImplicitBeforeDelimiter
    -- Close implicit layout contexts before 'then' and 'else' keywords (parse-error rule).
    -- These keywords cannot appear inside a do block, so we close contexts at >= their column.
    TkKeywordThen -> closeWith (closeForDedentInclusive (tokenStartCol tok))
    TkKeywordElse -> closeWith (closeForDedentInclusive (tokenStartCol tok))
    -- Close implicit layout contexts before 'where' (parse-error rule):
    -- 'where' at the same column as an implicit layout closes that layout,
    -- allowing it to attach to the enclosing definition.
    TkKeywordWhere -> closeWith (closeForDedentInclusiveAll (tokenStartCol tok))
    _ -> ([], st)
  where
    -- Run a closing strategy over the context stack and repack the state.
    closeWith close =
      let (inserted, contexts') = close (lexTokenSpan tok) (layoutContexts st)
       in (inserted, st {layoutContexts = contexts'})
540
-- | Beginning-of-line layout: when a token starts a new line, close dedented
-- implicit contexts; if the token then sits exactly at the current layout
-- indent, insert a virtual @;@ (unless the token is a continuation keyword
-- such as @then@/@else@ — see 'suppressesVirtualSemicolon').
bolLayout :: LayoutState -> LexToken -> ([LexToken], LayoutState)
bolLayout st tok
  | not (isBOL st tok) = ([], st)
  | otherwise = (closers <> separator, st {layoutContexts = remaining})
  where
    startCol = tokenStartCol tok
    anchor = lexTokenSpan tok
    (closers, remaining) = closeForDedent startCol anchor (layoutContexts st)
    separator =
      case currentLayoutIndentMaybe remaining of
        Just indent
          | startCol == indent,
            not (suppressesVirtualSemicolon tok) ->
              [virtualSymbolToken ";" anchor]
        _ -> []
555
-- | Tokens that continue the previous statement even when they appear at the
-- layout indent, so no virtual @;@ is inserted in front of them.
suppressesVirtualSemicolon :: LexToken -> Bool
suppressesVirtualSemicolon tok =
  lexTokenKind tok
    `elem` [ TkKeywordThen,
             TkKeywordElse,
             TkReservedDoubleArrow, -- =>
             TkReservedRightArrow, -- ->
             TkReservedEquals, -- =
             TkReservedPipe, -- |
             TkReservedDoubleColon -- ::
           ]
567
-- | Close implicit layout contexts whose indent is strictly greater than the
-- given column (the offside dedent rule), emitting one virtual @}@ per closed
-- context at the given anchor span. Stops at the first context that is
-- explicit, a delimiter marker, or an implicit context at or below the column.
closeForDedent :: Int -> SourceSpan -> [LayoutContext] -> ([LexToken], [LayoutContext])
closeForDedent col anchor = go []
  where
    go acc contexts =
      case implicitHead contexts of
        Just (indent, rest)
          | col < indent -> go (virtualSymbolToken "}" anchor : acc) rest
        _ -> (reverse acc, contexts)

    -- Indent and remaining stack when the top context is implicit (any of the
    -- three implicit variants); Nothing for explicit/delimiter/empty stacks.
    implicitHead contexts =
      case contexts of
        LayoutImplicit indent : rest -> Just (indent, rest)
        LayoutImplicitLet indent : rest -> Just (indent, rest)
        LayoutImplicitAfterThenElse indent : rest -> Just (indent, rest)
        _ -> Nothing
583
-- | Close layout contexts opened after 'then do' or 'else do' when encountering
-- 'then' or 'else' at the same or lesser indent. This handles the parse-error rule
-- for these specific cases where the keyword cannot be part of the do block.
--
-- This function first closes any implicit layouts with indent > col (regular dedent),
-- then closes LayoutImplicitAfterThenElse contexts where col <= indent.
-- This ensures that nested layouts (like case blocks) are closed before
-- the then/else-specific layout closing.
--
-- NOTE: the comparison differs per constructor (strict @<@ for the plain
-- implicit variants, @<=@ only for after-then-else), so the three patterns
-- are deliberately not merged.
closeForDedentInclusive :: Int -> SourceSpan -> [LayoutContext] -> ([LexToken], [LayoutContext])
closeForDedentInclusive col anchor = go []
  where
    go acc contexts =
      case contexts of
        -- Close any implicit layout with indent > col (dedent rule)
        LayoutImplicit indent : rest
          | col < indent -> go (virtualSymbolToken "}" anchor : acc) rest
          | otherwise -> (reverse acc, contexts)
        LayoutImplicitLet indent : rest
          | col < indent -> go (virtualSymbolToken "}" anchor : acc) rest
          | otherwise -> (reverse acc, contexts)
        -- Close LayoutImplicitAfterThenElse where col <= indent (parse-error rule)
        LayoutImplicitAfterThenElse indent : rest
          | col <= indent -> go (virtualSymbolToken "}" anchor : acc) rest
          | otherwise -> (reverse acc, contexts)
        _ -> (reverse acc, contexts)
609
-- | Close all implicit layout contexts at or above the given column.
-- Used for 'where' which needs to close all enclosing implicit layouts
-- (not just LayoutImplicitAfterThenElse like then/else). One virtual @}@ is
-- emitted per closed context at the given anchor span.
closeForDedentInclusiveAll :: Int -> SourceSpan -> [LayoutContext] -> ([LexToken], [LayoutContext])
closeForDedentInclusiveAll col anchor = go []
  where
    go acc contexts =
      case implicitHead contexts of
        Just (indent, rest)
          | col <= indent -> go (virtualSymbolToken "}" anchor : acc) rest
        _ -> (reverse acc, contexts)

    -- Indent and remaining stack when the top context is implicit (any of the
    -- three implicit variants); Nothing for explicit/delimiter/empty stacks.
    implicitHead contexts =
      case contexts of
        LayoutImplicit indent : rest -> Just (indent, rest)
        LayoutImplicitLet indent : rest -> Just (indent, rest)
        LayoutImplicitAfterThenElse indent : rest -> Just (indent, rest)
        _ -> Nothing
628
-- | One virtual @}@ for every implicit context anywhere in the stack
-- (used when the token stream ends).
closeAllImplicit :: [LayoutContext] -> SourceSpan -> [LexToken]
closeAllImplicit contexts anchor =
  map (const (virtualSymbolToken "}" anchor)) (filter isImplicitLayoutContext contexts)
632
-- | Close every 'LayoutImplicitLet' context on top of the stack, emitting one
-- virtual @}@ per closed context (used before @in@, and before @,@ outside
-- delimiters).
closeLeadingImplicitLets :: SourceSpan -> [LayoutContext] -> ([LexToken], [LayoutContext])
closeLeadingImplicitLets anchor contexts =
  (map (const (virtualSymbolToken "}" anchor)) lets, remaining)
  where
    (lets, remaining) = span isImplicitLet contexts
    isImplicitLet ctx =
      case ctx of
        LayoutImplicitLet _ -> True
        _ -> False
640
-- | Close all implicit layout contexts up to (but not including) the first
-- explicit or delimiter context. Implements the Haskell Report's
-- "parse-error" rule for closing delimiters.
closeAllImplicitBeforeDelimiter :: SourceSpan -> [LayoutContext] -> ([LexToken], [LayoutContext])
closeAllImplicitBeforeDelimiter anchor contexts =
  (map (const (virtualSymbolToken "}" anchor)) implicits, remaining)
  where
    (implicits, remaining) = span isImplicitLayoutContext contexts
652
-- | Per-token bookkeeping after layout insertion: record layout announcements
-- made by this token (do/of/\case/let/where) and maintain the delimiter and
-- explicit-brace portions of the context stack.
stepTokenContext :: LayoutState -> LexToken -> LayoutState
stepTokenContext st tok =
  case lexTokenKind tok of
    TkKeywordDo
      -- 'then do' / 'else do': the layout may later be closed by 'then'/'else'
      -- at the same indent (see 'closeForDedentInclusive').
      | layoutPrevTokenKind st == Just TkKeywordThen
          || layoutPrevTokenKind st == Just TkKeywordElse ->
          st {layoutPendingLayout = Just PendingLayoutAfterThenElse}
      | otherwise -> st {layoutPendingLayout = Just PendingLayoutGeneric}
    TkKeywordOf -> st {layoutPendingLayout = Just PendingLayoutGeneric}
    TkKeywordCase
      -- '\case': the backslash-then-case sequence introduces a layout block.
      | layoutPrevTokenKind st == Just TkReservedBackslash ->
          st {layoutPendingLayout = Just PendingLayoutGeneric}
      | otherwise -> st
    TkKeywordLet -> st {layoutPendingLayout = Just PendingLayoutLet}
    TkKeywordWhere -> st {layoutPendingLayout = Just PendingLayoutGeneric}
    -- ( and [ push a delimiter marker so implicit closures stay scoped.
    TkSpecialLParen ->
      st
        { layoutDelimiterDepth = layoutDelimiterDepth st + 1,
          layoutContexts = LayoutDelimiter : layoutContexts st
        }
    TkSpecialLBracket ->
      st
        { layoutDelimiterDepth = layoutDelimiterDepth st + 1,
          layoutContexts = LayoutDelimiter : layoutContexts st
        }
    -- ) and ] pop back to (and including) the matching delimiter marker.
    TkSpecialRParen ->
      st
        { layoutDelimiterDepth = max 0 (layoutDelimiterDepth st - 1),
          layoutContexts = popToDelimiter (layoutContexts st)
        }
    TkSpecialRBracket ->
      st
        { layoutDelimiterDepth = max 0 (layoutDelimiterDepth st - 1),
          layoutContexts = popToDelimiter (layoutContexts st)
        }
    TkSpecialLBrace -> st {layoutContexts = LayoutExplicit : layoutContexts st}
    TkSpecialRBrace -> st {layoutContexts = popOneContext (layoutContexts st)}
    _ -> st
691
-- | Pop the layout context stack up to and including the nearest LayoutDelimiter.
popToDelimiter :: [LayoutContext] -> [LayoutContext]
popToDelimiter = drop 1 . dropWhile (/= LayoutDelimiter)
699
-- | Drop the innermost layout context, if any.
popOneContext :: [LayoutContext] -> [LayoutContext]
popOneContext = drop 1
705
-- | Indent of the innermost implicit layout context, defaulting to 0.
currentLayoutIndent :: [LayoutContext] -> Int
currentLayoutIndent = fromMaybe 0 . currentLayoutIndentMaybe
708
-- | Indent of the innermost context when it is implicit; 'Nothing' when the
-- stack is empty or its top is explicit/delimiter.
currentLayoutIndentMaybe :: [LayoutContext] -> Maybe Int
currentLayoutIndentMaybe [] = Nothing
currentLayoutIndentMaybe (ctx : _) =
  case ctx of
    LayoutImplicit indent -> Just indent
    LayoutImplicitLet indent -> Just indent
    LayoutImplicitAfterThenElse indent -> Just indent
    _ -> Nothing
716
-- | True for the three implicit layout variants; False for explicit braces
-- and delimiter markers.
isImplicitLayoutContext :: LayoutContext -> Bool
isImplicitLayoutContext LayoutExplicit = False
isImplicitLayoutContext LayoutDelimiter = False
isImplicitLayoutContext _ = True
725
-- | A token begins a new line when its start line is beyond the previous
-- token's line; the very first token is never beginning-of-line.
isBOL :: LayoutState -> LexToken -> Bool
isBOL st tok = maybe False (tokenStartLine tok >) (layoutPrevLine st)
731
-- | Start line of a token's span (1 when the token has no span).
tokenStartLine :: LexToken -> Int
tokenStartLine tok =
  case lexTokenSpan tok of
    NoSourceSpan -> 1
    SourceSpan startLine _ _ _ -> startLine
737
-- | Start column of a token's span (1 when the token has no span).
tokenStartCol :: LexToken -> Int
tokenStartCol tok =
  case lexTokenSpan tok of
    NoSourceSpan -> 1
    SourceSpan _ startCol _ _ -> startCol
743
-- | Build a virtual layout token ("{", "}", or ";") anchored at the given span.
-- Any other symbol is a programming error in the layout algorithm.
virtualSymbolToken :: Text -> SourceSpan -> LexToken
virtualSymbolToken sym span' =
  LexToken
    { lexTokenKind = kindOf sym,
      lexTokenText = sym,
      lexTokenSpan = span'
    }
  where
    kindOf "{" = TkSpecialLBrace
    kindOf "}" = TkSpecialRBrace
    kindOf ";" = TkSpecialSemicolon
    kindOf other = error ("virtualSymbolToken: unexpected symbol " ++ T.unpack other)
755
-- | Lex a known pragma as a real token, trying each pragma parser in order
-- and taking the first that succeeds.
lexKnownPragma :: LexerState -> Maybe (LexToken, LexerState)
lexKnownPragma st = go pragmaParsers
  where
    pragmaParsers =
      [ parseLanguagePragma,
        parseOptionsPragma,
        parseWarningPragma,
        parseDeprecatedPragma
      ]

    go [] = Nothing
    go (parser : rest) =
      case parsePragmaLike parser st of
        Just ((raw, kind), st') -> Just (mkToken st st' raw kind, st')
        Nothing -> go rest
763
-- | Run a raw pragma parser over the current input; on success, advance the
-- lexer past the consumed prefix and return the parser's payload.
parsePragmaLike :: (String -> Maybe (Int, (Text, LexTokenKind))) -> LexerState -> Maybe ((Text, LexTokenKind), LexerState)
parsePragmaLike parser st =
  case parser (lexerInput st) of
    Nothing -> Nothing
    Just (consumedLen, payload) ->
      let (consumed, _) = splitAt consumedLen (lexerInput st)
       in Just (payload, advanceChars consumed st)
768
-- | Lex an identifier, qualified name, keyword, or qualified operator.
--
-- Keyword classification is exact: an identifier becomes a keyword token only
-- when its full text matches a reserved word in 'keywordTokenKind'; @where'@,
-- @_where@, and @M.where@ stay identifiers (see the module header).
--
-- Qualified operators like @Prelude.+@ are recognized here rather than in
-- 'lexOperator' because the module-path prefix lexes as an identifier.
lexIdentifier :: LexerState -> Maybe (LexToken, LexerState)
lexIdentifier st =
  case lexerInput st of
    c : rest
      | isIdentStart c ->
          let (seg, rest0) = span isIdentTail rest
              firstChunk = c : seg
              (consumed, rest1, isQualified) = gatherQualified firstChunk rest0
           in -- Check if we have a qualified operator (e.g., Prelude.+)
              case (isQualified || isAsciiUpper c, rest1) of
                (True, '.' : opChar : opRest)
                  | isSymbolicOpCharNotDot opChar ->
                      -- This is a qualified operator like Prelude.+ or A.B.C.:++
                      let (opChars, _) = span isSymbolicOpChar (opChar : opRest)
                          fullOp = consumed <> "." <> opChars
                          opTxt = T.pack fullOp
                          -- Leading ':' marks a constructor symbol.
                          kind =
                            if opChar == ':'
                              then TkQConSym opTxt
                              else TkQVarSym opTxt
                          st' = advanceChars fullOp st
                       in Just (mkToken st st' opTxt kind, st')
                _ ->
                  -- Regular identifier
                  let ident = T.pack consumed
                      kind = classifyIdentifier c isQualified ident
                      st' = advanceChars consumed st
                   in Just (mkToken st st' ident kind, st')
    _ -> Nothing
  where
    -- Greedily extend "Mod.Sub.name" chains.
    -- Returns (consumed, remaining, isQualified)
    gatherQualified acc chars =
      case chars of
        '.' : c' : more
          | isIdentStart c' ->
              let (seg, rest) = span isIdentTail more
               in gatherQualified (acc <> "." <> [c'] <> seg) rest
        _ -> (acc, chars, '.' `elem` acc)

    -- Check for symbol char that is not '.' to avoid consuming module path dots
    isSymbolicOpCharNotDot c = isSymbolicOpChar c && c /= '.'

    classifyIdentifier firstChar isQualified ident
      | isQualified =
          -- Qualified name: use final part to determine var/con
          let finalPart = T.takeWhileEnd (/= '.') ident
              firstCharFinal = T.head finalPart
           in if isAsciiUpper firstCharFinal
                then TkQConId ident
                else TkQVarId ident
      | otherwise =
          -- Unqualified: check for keyword first (exact match only).
          case keywordTokenKind ident of
            Just kw -> kw
            Nothing ->
              if isAsciiUpper firstChar
                then TkConId ident
                else TkVarId ident
827
-- | Handle @-@ when NegativeLiterals and\/or LexicalNegation is enabled.
--
-- Fires only on a standalone minus (not the head of @->@, @-<@, etc.).
--
-- * NegativeLiterals: when the preceding context allows a tight prefix, the
--   minus is merged with an immediately following numeric literal into one
--   negative-literal token (via 'tryLexNumberAfterMinus').
-- * LexicalNegation: otherwise a lone minus becomes 'TkPrefixMinus' or
--   'TkMinusOperator' depending on position (via 'lexMinusOperator').
--
-- Returns Nothing when neither extension is enabled so that 'lexOperator'
-- handles the character in the ordinary way.
lexNegativeLiteralOrMinus :: LexerState -> Maybe (LexToken, LexerState)
lexNegativeLiteralOrMinus st
  | not hasNegExt = Nothing
  | not (isStandaloneMinus (lexerInput st)) = Nothing
  | otherwise =
      let prevAllows = allowsMergeOrPrefix (lexerPrevTokenKind st) (lexerHadTrivia st)
          rest = drop 1 (lexerInput st) -- input after '-'
       in if NegativeLiterals `elem` lexerExtensions st && prevAllows
            then case tryLexNumberAfterMinus st of
              Just result -> Just result
              -- '-' not followed by a number: fall back to operator handling.
              Nothing -> lexMinusOperator st rest prevAllows
            else lexMinusOperator st rest prevAllows
  where
    hasNegExt =
      NegativeLiterals `elem` lexerExtensions st
        || LexicalNegation `elem` lexerExtensions st
859
-- | True when the input begins with a '-' that stands alone, i.e. is not the
-- first character of a longer symbolic operator such as @->@ or @-<@.
-- A second '-' does not disqualify it (only symbolic chars other than '-'
-- make this the head of a multi-char operator).
isStandaloneMinus :: String -> Bool
isStandaloneMinus input =
  case input of
    '-' : following ->
      case following of
        next : _ -> next == '-' || not (isSymbolicOpChar next)
        [] -> True
    _ -> False
867
-- | Attempt to lex a negative numeric literal: step past the '-', run each
-- number lexer on the remaining input, and on success widen the resulting
-- token to include the sign (see 'negateToken').
tryLexNumberAfterMinus :: LexerState -> Maybe (LexToken, LexerState)
tryLexNumberAfterMinus st = fmap wrap (runFirst numberLexers)
  where
    -- State positioned just past the '-'.
    stAfterMinus = advanceChars "-" st

    -- Most specific lexers first so hex floats win over hex ints, etc.
    numberLexers = [lexHexFloat, lexFloat, lexIntBase, lexInt]

    -- First lexer that succeeds on the post-minus state.
    runFirst = foldr (\lexer alt -> maybe alt Just (lexer stAfterMinus)) Nothing

    wrap (numTok, stFinal) = (negateToken st numTok, stFinal)
885
-- | Negate a numeric token and adjust its span/text to include the leading '-'.
--
-- @stBefore@ is the lexer state positioned at the '-', so the widened span
-- starts at the sign while keeping the number's end position.
negateToken :: LexerState -> LexToken -> LexToken
negateToken stBefore numTok =
  LexToken
    { lexTokenKind = negateKind (lexTokenKind numTok),
      lexTokenText = "-" <> lexTokenText numTok,
      lexTokenSpan = extendSpanLeft (lexTokenSpan numTok)
    }
  where
    -- Negate the carried value; also prepend '-' to any stored raw rendering.
    negateKind k = case k of
      TkInteger n -> TkInteger (negate n)
      TkIntegerBase n repr -> TkIntegerBase (negate n) ("-" <> repr)
      TkFloat n repr -> TkFloat (negate n) ("-" <> repr)
      other -> other -- shouldn't happen: callers only pass numeric tokens

    -- Extend span to start at the '-' position
    extendSpanLeft sp = case sp of
      SourceSpan _ _ endLine endCol ->
        SourceSpan (lexerLine stBefore) (lexerCol stBefore) endLine endCol
      NoSourceSpan -> NoSourceSpan
906
-- | Lex a lone '-' under LexicalNegation: prefix position (context allows a
-- tight prefix and a negatable atom follows) yields 'TkPrefixMinus'; every
-- other position yields 'TkMinusOperator'.  Nothing when the extension is
-- off, leaving the character to 'lexOperator'.
lexMinusOperator :: LexerState -> String -> Bool -> Maybe (LexToken, LexerState)
lexMinusOperator st rest prevAllows =
  if LexicalNegation `notElem` lexerExtensions st
    then Nothing
    else
      let stNext = advanceChars "-" st
          tokKind
            | prevAllows && canStartNegatedAtom rest = TkPrefixMinus
            | otherwise = TkMinusOperator
       in Just (mkToken st stNext "-" tokKind, stNext)
918
-- | Decide whether a '-' in the current position may bind tightly to what
-- follows (negative literal under NegativeLiterals, prefix minus under
-- LexicalNegation).
--
-- Allowed at the start of input, after whitespace\/trivia, or after a token
-- that admits a tight unary prefix (see 'prevTokenAllowsTightPrefix').
allowsMergeOrPrefix :: Maybe LexTokenKind -> Bool -> Bool
allowsMergeOrPrefix prev hadTrivia =
  maybe True (\prevKind -> hadTrivia || prevTokenAllowsTightPrefix prevKind) prev
932
-- | Check if the preceding token allows a tight unary prefix (like negation).
-- This is used when there's no whitespace between the previous token and '-'.
prevTokenAllowsTightPrefix :: LexTokenKind -> Bool
prevTokenAllowsTightPrefix kind =
  case kind of
    -- Opening brackets and separators: nothing to the left to bind to.
    TkSpecialLParen -> True
    TkSpecialLBracket -> True
    TkSpecialLBrace -> True
    TkSpecialComma -> True
    TkSpecialSemicolon -> True
    -- Operators: a '-' right after an operator must be a prefix, not infix.
    TkVarSym _ -> True
    TkConSym _ -> True
    TkQVarSym _ -> True
    TkQConSym _ -> True
    TkMinusOperator -> True
    TkPrefixMinus -> True
    -- Reserved operators that introduce an expression position.
    TkReservedEquals -> True
    TkReservedLeftArrow -> True
    TkReservedRightArrow -> True
    TkReservedDoubleArrow -> True
    TkReservedDoubleColon -> True
    TkReservedPipe -> True
    TkReservedBackslash -> True
    -- Identifiers, literals, closing brackets, etc.: '-' would be infix.
    _ -> False
957
-- | Can this input begin the operand of a prefix minus (LexicalNegation)?
-- Identifiers, digits, char\/string literals, parenthesized or bracketed
-- expressions, lambdas, and a further '-' (nested negation) all qualify.
canStartNegatedAtom :: String -> Bool
canStartNegatedAtom rest =
  case rest of
    c : _ -> isIdentStart c || isDigit c || c `elem` ("'\"([\\-" :: String)
    [] -> False
973
-- | Whitespace-sensitive lexing for @!@ and @~@ (GHC proposal 0229).
--
-- Classification depends on surrounding whitespace:
--
-- > a ! b   -- loose infix (operator)
-- > a!b     -- tight infix (operator)
-- > a !b    -- prefix (bang/lazy pattern)
-- > a! b    -- suffix (operator)
--
-- Only the prefix occurrence is recognized here; all other occurrences fall
-- through to 'lexOperator'.
lexBangOrTildeOperator :: LexerState -> Maybe (LexToken, LexerState)
lexBangOrTildeOperator st =
  case lexerInput st of
    c : rest
      | c == '!' -> lexPrefixSensitiveOp st c [c] TkPrefixBang rest
      | c == '~' -> lexPrefixSensitiveOp st c [c] TkPrefixTilde rest
    _ -> Nothing
991
-- | Lex a whitespace-sensitive prefix operator (@!@ or @~@).
--
-- Emits the prefix token kind only when the operator occupies prefix
-- position per GHC proposal 0229: whitespace (or start of input, or an
-- opening token that admits a tight prefix) before it, AND a pattern atom
-- immediately after.  In every other situation returns Nothing so that
-- 'lexOperator' produces a regular VarSym.
lexPrefixSensitiveOp :: LexerState -> Char -> String -> LexTokenKind -> String -> Maybe (LexToken, LexerState)
lexPrefixSensitiveOp st opChar opStr prefixKind rest
  | partOfLargerOp = Nothing
  | inPrefixPosition && canStartPrefixPatternAtom rest =
      let advanced = advanceChars opStr st
       in Just (mkToken st advanced (T.singleton opChar) prefixKind, advanced)
  | otherwise = Nothing
  where
    -- A following symbolic char means this is the head of a longer operator
    -- (e.g. "!="), which must be lexed as a whole by lexOperator.
    partOfLargerOp =
      case rest of
        c : _ -> isSymbolicOpChar c
        [] -> False

    -- Prefix position: start of input, after whitespace/trivia, or tight
    -- after a token (opening bracket, operator, ...) that allows a prefix.
    inPrefixPosition =
      maybe
        True
        (\prevKind -> lexerHadTrivia st || prevTokenAllowsTightPrefix prevKind)
        (lexerPrevTokenKind st)
1024
-- | Can this input begin a pattern atom after a prefix @!@ or @~@?
-- Like 'canStartNegatedAtom' but for patterns: identifiers, digits,
-- char\/string literals, parenthesized\/list patterns, wildcards, and nested
-- bang\/lazy patterns.
canStartPrefixPatternAtom :: String -> Bool
canStartPrefixPatternAtom rest =
  case rest of
    c : _ -> isIdentStart c || isDigit c || c `elem` ("'\"([_!~" :: String)
    [] -> False
1042
-- | Lex a maximal run of symbolic characters as an operator token.
-- Reserved operators take priority; with UnicodeSyntax enabled, known
-- Unicode operators are mapped to their ASCII token kinds; otherwise the
-- first character (':' or not) selects ConSym vs VarSym.
lexOperator :: LexerState -> Maybe (LexToken, LexerState)
lexOperator st =
  let (opChars, _) = span isSymbolicOpChar (lexerInput st)
   in case opChars of
        [] -> Nothing
        firstCh : _ ->
          let opTxt = T.pack opChars
              advanced = advanceChars opChars st
              unicodeOn = UnicodeSyntax `elem` lexerExtensions st
              tokKind =
                case reservedOpTokenKind opTxt of
                  Just reserved -> reserved
                  Nothing
                    | unicodeOn -> unicodeOpTokenKind opTxt firstCh
                    | firstCh == ':' -> TkConSym opTxt
                    | otherwise -> TkVarSym opTxt
           in Just (mkToken st advanced opTxt tokKind, advanced)
1058
-- | Map Unicode operators to their ASCII equivalents when UnicodeSyntax is
-- enabled.  Unknown operators fall back to TkConSym\/TkVarSym based on
-- whether the first character is ':'.
unicodeOpTokenKind :: Text -> Char -> LexTokenKind
unicodeOpTokenKind txt firstChar =
  -- OverloadedStrings lets us match the Text value directly.
  case txt of
    "∷" -> TkReservedDoubleColon -- ::
    "⇒" -> TkReservedDoubleArrow -- =>
    "→" -> TkReservedRightArrow -- ->
    "←" -> TkReservedLeftArrow -- <-
    "∀" -> TkVarId "forall" -- forall
    "★" -> TkVarSym "*" -- kind star
    "⤙" -> TkVarSym "-<" -- arrow notation
    "⤚" -> TkVarSym ">-" -- arrow notation
    "⤛" -> TkVarSym "-<<" -- arrow notation
    "⤜" -> TkVarSym ">>-" -- arrow notation
    "⦇" -> TkVarSym "(|" -- left banana bracket
    "⦈" -> TkVarSym "|)" -- right banana bracket
    "⟦" -> TkVarSym "[|" -- left semantic bracket
    "⟧" -> TkVarSym "|]" -- right semantic bracket
    "⊸" -> TkVarSym "%1->" -- linear arrow
    _
      | firstChar == ':' -> TkConSym txt
      | otherwise -> TkVarSym txt
1083
-- | Lex a single special symbol: brackets, braces, parens, comma, semicolon
-- or backtick.  Entries are tried in order against the current input.
lexSymbol :: LexerState -> Maybe (LexToken, LexerState)
lexSymbol st = searchTable specialTable
  where
    specialTable =
      [ ("(", TkSpecialLParen),
        (")", TkSpecialRParen),
        ("[", TkSpecialLBracket),
        ("]", TkSpecialRBracket),
        ("{", TkSpecialLBrace),
        ("}", TkSpecialRBrace),
        (",", TkSpecialComma),
        (";", TkSpecialSemicolon),
        ("`", TkSpecialBacktick)
      ]

    searchTable [] = Nothing
    searchTable ((sym, kind) : remaining)
      | sym `List.isPrefixOf` lexerInput st =
          let advanced = advanceChars sym st
           in Just (mkToken st advanced (T.pack sym) kind, advanced)
      | otherwise = searchTable remaining
1107
-- | Lex a based integer literal: @0x@\/@0X@ (hex), @0o@\/@0O@ (octal) or
-- @0b@\/@0B@ (binary), honouring NumericUnderscores for digit separators
-- (leading underscores after the base prefix are allowed, e.g. @0x_ff@).
lexIntBase :: LexerState -> Maybe (LexToken, LexerState)
lexIntBase st =
  case lexerInput st of
    '0' : base : rest
      | base `elem` ("xXoObB" :: String) ->
          let allowUnderscores = NumericUnderscores `elem` lexerExtensions st
              -- Digit predicate for the selected base.
              isDigitChar
                | base `elem` ("xX" :: String) = isHexDigit
                | base `elem` ("oO" :: String) = isOctDigit
                | otherwise = (`elem` ("01" :: String))
              (digitsRaw, _) = takeDigitsWithLeadingUnderscores allowUnderscores isDigitChar rest
           in if null digitsRaw
                -- A bare "0x" with no digits is not a based literal.
                then Nothing
                else
                  let raw = '0' : base : digitsRaw
                      txt = T.pack raw
                      n
                        | base `elem` ("xX" :: String) = readHexLiteral txt
                        | base `elem` ("oO" :: String) = readOctLiteral txt
                        | otherwise = readBinLiteral txt
                      st' = advanceChars raw st
                   in Just (mkToken st st' txt (TkIntegerBase n txt), st')
    _ -> Nothing
1131
-- | Lex a hexadecimal floating-point literal (@0x1.8p3@ style): hex integer
-- part, optional @.@ + hex fraction, mandatory binary exponent.
--
-- Runs in the Maybe monad; the failable patterns (the @('0' : x : rest)@
-- match and the non-empty exponent match) fall through to Nothing via
-- MonadFail rather than crashing.
lexHexFloat :: LexerState -> Maybe (LexToken, LexerState)
lexHexFloat st = do
  ('0' : x : rest) <- Just (lexerInput st)
  if x `notElem` ("xX" :: String)
    then Nothing
    else do
      let (intDigits, rest1) = span isHexDigit rest
      if null intDigits
        then Nothing
        else do
          -- Optional fractional part after a '.'.
          let (mFracDigits, rest2) =
                case rest1 of
                  '.' : more ->
                    let (frac, rest') = span isHexDigit more
                     in (Just frac, rest')
                  _ -> (Nothing, rest1)
          -- The exponent marker is mandatory for a hex float; an empty
          -- result (pattern failure) means this is not a hex float at all.
          expo@(_ : expoRest) <- takeHexExponent rest2
          let fracDigits = fromMaybe "" mFracDigits
          if null expoRest
            -- Marker present but no exponent digits: malformed, reject.
            then Nothing
            else
              let dotAndFrac =
                    case mFracDigits of
                      Just ds -> '.' : ds
                      Nothing -> ""
                  raw = '0' : x : intDigits <> dotAndFrac <> expo
                  txt = T.pack raw
                  value = parseHexFloatLiteral intDigits fracDigits expo
                  st' = advanceChars raw st
               in Just (mkToken st st' txt (TkFloat value txt), st')
1162
-- | Lex a decimal floating-point literal: digits '.' digits with optional
-- exponent, or digits with a mandatory exponent (e.g. @1e5@).  Honours
-- NumericUnderscores for digit separators.
--
-- 'read' on the underscore-stripped text is safe here: the input was built
-- exclusively from decimal digits, '.', and a well-formed exponent.
lexFloat :: LexerState -> Maybe (LexToken, LexerState)
lexFloat st =
  let allowUnderscores = NumericUnderscores `elem` lexerExtensions st
      (lhsRaw, rest) = takeDigitsWithUnderscores allowUnderscores isDigit (lexerInput st)
   in if null lhsRaw
        then Nothing
        else case rest of
          -- "digits . digit ..." form; the dot must be followed by a digit
          -- so that e.g. "1..2" (range syntax) is not consumed.
          '.' : d : more
            | isDigit d ->
                let (rhsRaw, rest') = takeDigitsWithUnderscores allowUnderscores isDigit (d : more)
                    (expo, _) = takeExponent allowUnderscores rest'
                    raw = lhsRaw <> "." <> rhsRaw <> expo
                    txt = T.pack raw
                    normalized = filter (/= '_') raw
                    st' = advanceChars raw st
                 in Just (mkToken st st' txt (TkFloat (read normalized) txt), st')
          _ ->
            -- No fraction: only a float if an exponent follows ("1e5").
            case takeExponent allowUnderscores rest of
              ("", _) -> Nothing
              (expo, _) ->
                let raw = lhsRaw <> expo
                    txt = T.pack raw
                    normalized = filter (/= '_') raw
                    st' = advanceChars raw st
                 in Just (mkToken st st' txt (TkFloat (read normalized) txt), st')
1188
-- | Lex a plain decimal integer literal, honouring NumericUnderscores.
lexInt :: LexerState -> Maybe (LexToken, LexerState)
lexInt st
  | null digitsRaw = Nothing
  | otherwise = Just (mkToken st advanced rawTxt (TkInteger value), advanced)
  where
    underscoresOk = NumericUnderscores `elem` lexerExtensions st
    (digitsRaw, _) = takeDigitsWithUnderscores underscoresOk isDigit (lexerInput st)
    rawTxt = T.pack digitsRaw
    -- Strip separators before reading; the remaining text is all digits,
    -- so 'read' cannot fail here.
    value = read (filter (/= '_') digitsRaw)
    advanced = advanceChars digitsRaw st
1200
-- | Lex a DataKinds promotion quote: a @'@ immediately followed by a
-- promotable construct (@[@, @(@, @:@, or an uppercase constructor).
--
-- A quote that starts a valid character literal is deliberately NOT treated
-- as promotion, so @'a'@ still lexes as a char literal downstream.
lexPromotedQuote :: LexerState -> Maybe (LexToken, LexerState)
lexPromotedQuote st
  | DataKinds `notElem` lexerExtensions st = Nothing
  | otherwise =
      case lexerInput st of
        '\'' : rest
          | isValidCharLiteral rest -> Nothing
          | isPromotionStart rest ->
              let st' = advanceChars "'" st
               in Just (mkToken st st' "'" (TkVarSym "'"), st')
          | otherwise -> Nothing
        _ -> Nothing
  where
    -- Would the text after the quote complete a valid char literal?
    isValidCharLiteral chars =
      case scanQuoted '\'' chars of
        Right (body, _) -> isJust (readMaybeChar ('\'' : body <> "'"))
        Left _ -> False

    -- Promotable heads: '[ (promoted list), '( (promoted tuple),
    -- ': (promoted cons operator), 'Con (promoted constructor).
    isPromotionStart chars =
      case chars of
        c : _
          | c == '[' -> True
          | c == '(' -> True
          | c == ':' -> True
          | isAsciiUpper c -> True
        _ -> False
1227
-- | Lex a character literal.  Error tolerance: a body that scans but does
-- not decode yields a TkError token ("invalid char literal"), and a missing
-- closing quote yields a TkError token ("unterminated char literal") — the
-- offending text is consumed either way so lexing can continue.
lexChar :: LexerState -> Maybe (LexToken, LexerState)
lexChar st =
  case lexerInput st of
    '\'' : rest ->
      case scanQuoted '\'' rest of
        Right (body, _) ->
          let raw = '\'' : body <> "'"
              st' = advanceChars raw st
           in case readMaybeChar raw of
                Just c -> Just (mkToken st st' (T.pack raw) (TkChar c), st')
                Nothing -> Just (mkErrorToken st st' (T.pack raw) "invalid char literal", st')
        Left raw ->
          -- scanQuoted returns the unterminated remainder on the Left.
          let full = '\'' : raw
              st' = advanceChars full st
           in Just (mkErrorToken st st' (T.pack full) "unterminated char literal", st')
    _ -> Nothing
1244
-- | Lex a string literal.  The decoded value is obtained via 'reads' on the
-- full quoted text; if that fails, the raw body is used as-is (escape
-- sequences left unprocessed) rather than aborting — best-effort decoding.
-- A missing closing quote produces a TkError token and consumes the rest.
lexString :: LexerState -> Maybe (LexToken, LexerState)
lexString st =
  case lexerInput st of
    '"' : rest ->
      case scanQuoted '"' rest of
        Right (body, _) ->
          let raw = "\"" <> body <> "\""
              decoded =
                case reads raw of
                  [(str, "")] -> T.pack str
                  -- Fallback: keep the undecoded body verbatim.
                  _ -> T.pack body
              st' = advanceChars raw st
           in Just (mkToken st st' (T.pack raw) (TkString decoded), st')
        Left raw ->
          let full = '"' : raw
              st' = advanceChars full st
           in Just (mkErrorToken st st' (T.pack full) "unterminated string literal", st')
    _ -> Nothing
1263
-- | Lex a quasi-quotation @[quoter| body |]@ into a single 'TkQuasiQuote'
-- token carrying the quoter name and the raw body text.
lexQuasiQuote :: LexerState -> Maybe (LexToken, LexerState)
lexQuasiQuote st =
  case lexerInput st of
    '[' : rest ->
      case parseQuasiQuote rest of
        Just (quoter, body, _) ->
          -- Re-render the consumed text; by construction it equals the
          -- original "[quoter|body|]" source.
          let raw = "[" <> quoter <> "|" <> body <> "|]"
              st' = advanceChars raw st
           in Just (mkToken st st' (T.pack raw) (TkQuasiQuote (T.pack quoter) (T.pack body)), st')
        Nothing -> Nothing
    _ -> Nothing
  where
    -- Parse "quoter|body|]" after the opening '['.  The third component
    -- (consumed tail) is currently unused by the caller.
    parseQuasiQuote chars =
      let (quoter, rest0) = takeQuoter chars
       in case rest0 of
            '|' : rest1
              | not (null quoter) ->
                  let (body, rest2) = breakOnMarker "|]" rest1
                   in case rest2 of
                        '|' : ']' : _ -> Just (quoter, body, take (length body + 2) rest1)
                        -- No closing "|]": not a quasi-quote.
                        _ -> Nothing
            _ -> Nothing
1286
-- | Produce a one-character error token carrying the given message,
-- advancing past the offending character.  At end of input the token text
-- is the "<eof>" marker and the state is left unchanged.
lexErrorToken :: LexerState -> Text -> (LexToken, LexerState)
lexErrorToken st msg
  | null offending = (mkErrorToken st st "<eof>" msg, st)
  | otherwise =
      let st' = advanceChars offending st
       in (mkErrorToken st st' (T.pack offending) msg, st')
  where
    offending = take 1 (lexerInput st)
1295
-- | Build a 'TkError' token spanning @start@..@end@ with the given raw text
-- and error message.
mkErrorToken :: LexerState -> LexerState -> Text -> Text -> LexToken
mkErrorToken start end rawTxt = mkToken start end rawTxt . TkError
1299
-- | Consume a CPP-style @#@\/@#line@ directive, which is only recognized at
-- the start of a line (after optional spaces\/tabs).
--
-- Returns @Just (Nothing, st')@ for a well-formed directive (the position
-- update is applied in-band, no token emitted), @Just (Just errTok, st')@
-- for a malformed one, and Nothing when no directive starts here.
tryConsumeLineDirective :: LexerState -> Maybe (Maybe LexToken, LexerState)
tryConsumeLineDirective st
  | not (lexerAtLineStart st) = Nothing
  | otherwise =
      let (spaces, rest) = span (\c -> c == ' ' || c == '\t') (lexerInput st)
       in case rest of
            '#' : more ->
              -- Everything to the end of the line belongs to the directive.
              let lineText = '#' : takeLineRemainder more
                  consumed = spaces <> lineText
               in case parseHashLineDirective lineText of
                    Just update ->
                      Just (Nothing, applyDirectiveAdvance consumed update st)
                    Nothing ->
                      let st' = advanceChars consumed st
                       in Just (Just (mkToken st st' (T.pack consumed) (TkError "malformed line directive")), st')
            _ -> Nothing
1316
-- | Consume an in-band {-# LINE #-} or {-# COLUMN #-} pragma.
--
-- A successful LINE pragma also swallows the newline that follows it (when
-- present) while pinning the position to the directive's line and column 1,
-- so the next token starts exactly where the directive says.  Malformed
-- pragmas are consumed and surfaced as a TkError token.
tryConsumeControlPragma :: LexerState -> Maybe (Maybe LexToken, LexerState)
tryConsumeControlPragma st =
  case parseControlPragma (lexerInput st) of
    Just (consumed0, Right update0) ->
      let (consumed, update) =
            case directiveLine update0 of
              Just lineNo ->
                -- LINE pragma: absorb the trailing newline so the directive
                -- line number applies to the very next physical line.
                case drop (length consumed0) (lexerInput st) of
                  '\n' : _ ->
                    (consumed0 <> "\n", update0 {directiveLine = Just lineNo, directiveCol = Just 1})
                  _ -> (consumed0, update0)
              Nothing -> (consumed0, update0)
       in Just (Nothing, applyDirectiveAdvance consumed update st)
    Just (consumed, Left msg) ->
      let st' = advanceChars consumed st
       in Just (Just (mkToken st st' (T.pack consumed) (TkError msg)), st')
    Nothing -> Nothing
1334
-- | Drop the directive's consumed text from the input and overwrite the
-- current line\/column with the directive's values (each clamped to >= 1;
-- absent values leave the current position untouched).
--
-- The start-of-line flag is set when the consumed text ended with a newline
-- or the directive explicitly resets the column to 1.
applyDirectiveAdvance :: String -> DirectiveUpdate -> LexerState -> LexerState
applyDirectiveAdvance consumed update st =
  let hasTrailingNewline =
        case reverse consumed of
          '\n' : _ -> True
          _ -> False
   in st
        { lexerInput = drop (length consumed) (lexerInput st),
          lexerLine = maybe (lexerLine st) (max 1) (directiveLine update),
          lexerCol = maybe (lexerCol st) (max 1) (directiveCol update),
          lexerAtLineStart = hasTrailingNewline || (Just 1 == directiveCol update)
        }
1347
-- | Skip a line comment: the "--" plus everything up to, but not including,
-- the terminating newline.  Assumes the input starts with "--".
consumeLineComment :: LexerState -> LexerState
consumeLineComment st =
  let afterDashes = drop 2 (lexerInput st)
      commentText = "--" <> takeWhile (/= '\n') afterDashes
   in advanceChars commentText st
1355
-- | Skip over an unrecognized pragma up to and including its "#-}" closer.
-- Nothing when the closer is missing (unterminated pragma).
consumeUnknownPragma :: LexerState -> Maybe LexerState
consumeUnknownPragma st =
  case breakOnMarker "#-}" (lexerInput st) of
    (_, "") -> Nothing
    (body, closer) -> Just (advanceChars (body <> take 3 closer) st)
1363
-- | Skip a (possibly nested) block comment.  Nothing when unterminated.
-- Assumes the input starts with "{-".
consumeBlockComment :: LexerState -> Maybe LexerState
consumeBlockComment st =
  fmap
    (\commentTail -> advanceChars ("{-" <> commentTail) st)
    (scanNestedBlockComment 1 (drop 2 (lexerInput st)))
1369
-- | Skip a block comment; when unterminated, consume the remaining input
-- and queue a TkError token describing the problem.
consumeBlockCommentOrError :: LexerState -> LexerState
consumeBlockCommentOrError st =
  case consumeBlockComment st of
    Just ok -> ok
    Nothing -> unterminated
  where
    unterminated =
      let remaining = lexerInput st
          stEnd = advanceChars remaining st
          errTok = mkToken st stEnd (T.pack remaining) (TkError "unterminated block comment")
       in stEnd {lexerPending = lexerPending stEnd <> [errTok]}
1379
-- | Scan the body of a block comment (input positioned just after the
-- opening "{-"), tracking nesting depth.  Returns the consumed text up to
-- and including the matching "-}", or Nothing when input runs out first.
scanNestedBlockComment :: Int -> String -> Maybe String
scanNestedBlockComment depth _
  | depth <= 0 = Just ""
scanNestedBlockComment depth ('{' : '-' : more) =
  fmap ("{-" <>) (scanNestedBlockComment (depth + 1) more)
scanNestedBlockComment 1 ('-' : '}' : _) = Just "-}"
scanNestedBlockComment depth ('-' : '}' : more) =
  fmap ("-}" <>) (scanNestedBlockComment (depth - 1) more)
scanNestedBlockComment depth (c : more) =
  fmap (c :) (scanNestedBlockComment depth more)
scanNestedBlockComment _ [] = Nothing
1392
-- | Probe: succeeds (with @()@) exactly when a known pragma starts at the
-- current position.  Nothing is consumed; callers re-lex via 'lexKnownPragma'.
tryConsumeKnownPragma :: LexerState -> Maybe ()
tryConsumeKnownPragma st = fmap (const ()) (lexKnownPragma st)
1398
-- | Parse a @{-# LANGUAGE ... #-}@ pragma.  Yields the number of input
-- characters consumed together with a normalized rendering of the pragma
-- and the parsed extension settings.
parseLanguagePragma :: String -> Maybe (Int, (Text, LexTokenKind))
parseLanguagePragma input = do
  (_, body, consumed) <- stripNamedPragma ["LANGUAGE"] input
  let settings = parseLanguagePragmaNames (T.pack body)
      rendered =
        "{-# LANGUAGE "
          <> T.intercalate ", " (map extensionSettingName settings)
          <> " #-}"
  pure (length consumed, (rendered, TkPragmaLanguage settings))
1405
-- | Parse an @{-# OPTIONS_GHC ... #-}@ or @{-# OPTIONS ... #-}@ pragma,
-- extracting any extension settings implied by its flags.
parseOptionsPragma :: String -> Maybe (Int, (Text, LexTokenKind))
parseOptionsPragma input = do
  (pragmaName, body, consumed) <- stripNamedPragma ["OPTIONS_GHC", "OPTIONS"] input
  let settings = parseOptionsPragmaSettings (T.pack body)
      rendered =
        T.pack ("{-# " <> pragmaName <> " ")
          <> T.strip (T.pack body)
          <> " #-}"
  pure (length consumed, (rendered, TkPragmaLanguage settings))
1412
-- | Parse a @{-# WARNING "..." #-}@ pragma.  A body that is a single valid
-- string literal is decoded; anything else is carried verbatim (trimmed).
parseWarningPragma :: String -> Maybe (Int, (Text, LexTokenKind))
parseWarningPragma input = do
  (_, body, consumed) <- stripNamedPragma ["WARNING"] input
  let txt = T.strip (T.pack body)
      -- msg is the decoded message; rawMsg preserves the source spelling.
      (msg, rawMsg) =
        case body of
          '"' : _ ->
            case reads body of
              [(decoded, "")] -> (T.pack decoded, T.pack body)
              -- Not a single clean string literal: fall back to raw text.
              _ -> (txt, txt)
          _ -> (txt, txt)
      raw = "{-# WARNING " <> rawMsg <> " #-}"
  pure (length consumed, (raw, TkPragmaWarning msg))
1426
-- | Parse a @{-# DEPRECATED "..." #-}@ pragma.  Same decoding strategy as
-- 'parseWarningPragma': decode a clean string literal, else keep raw text.
parseDeprecatedPragma :: String -> Maybe (Int, (Text, LexTokenKind))
parseDeprecatedPragma input = do
  (_, body, consumed) <- stripNamedPragma ["DEPRECATED"] input
  let txt = T.strip (T.pack body)
      -- msg is the decoded message; rawMsg preserves the source spelling.
      (msg, rawMsg) =
        case body of
          '"' : _ ->
            case reads body of
              [(decoded, "")] -> (T.pack decoded, T.pack body)
              -- Not a single clean string literal: fall back to raw text.
              _ -> (txt, txt)
          _ -> (txt, txt)
      raw = "{-# DEPRECATED " <> rawMsg <> " #-}"
  pure (length consumed, (raw, TkPragmaDeprecated msg))
1440
-- | Strip a single named pragma, keeping only its body text.
stripPragma :: String -> String -> Maybe String
stripPragma name input =
  fmap (\(_, body, _) -> body) (stripNamedPragma [name] input)
1443
-- | Match a @{-# NAME body #-}@ pragma against a list of candidate names.
-- Returns (matched name, trimmed body, exact consumed source prefix).
--
-- Names are matched by prefix, so longer names must precede shorter ones in
-- @names@ (e.g. \"OPTIONS_GHC\" before \"OPTIONS\") or the shorter name
-- would win and leave its suffix in the body.
stripNamedPragma :: [String] -> String -> Maybe (String, String, String)
stripNamedPragma names input = do
  rest0 <- List.stripPrefix "{-#" input
  let rest1 = dropWhile isSpace rest0
  name <- List.find (`List.isPrefixOf` rest1) names
  rest2 <- List.stripPrefix name rest1
  let rest3 = dropWhile isSpace rest2
      (body, marker) = breakOnMarker "#-}" rest3
  -- Fail unless the "#-}" closer was actually found.
  guardPrefix "#-}" marker
  -- consumedLen = everything up to and including the 3-char closer.
  let consumedLen = length input - length (drop 3 marker)
  pure (name, trimRight body, take consumedLen input)
1455
-- | Split a LANGUAGE pragma body on commas and parse each extension name,
-- silently dropping anything unrecognized.  Text after a '#' in a chunk
-- (a stray "#-}" fragment) is ignored.
parseLanguagePragmaNames :: Text -> [ExtensionSetting]
parseLanguagePragmaNames body =
  [ setting
    | chunk <- T.splitOn "," body,
      Just setting <- [parseExtensionSettingName (T.strip (T.takeWhile (/= '#') chunk))]
  ]
1459
-- | Extract extension settings from an OPTIONS pragma body: recognizes the
-- legacy @-cpp@ and @-fffi@ flags, the @-fglasgow-exts@ bundle, and
-- @-X<Extension>@ flags.  All other options are ignored.
parseOptionsPragmaSettings :: Text -> [ExtensionSetting]
parseOptionsPragmaSettings body = go (pragmaWords body)
  where
    go ws =
      case ws of
        [] -> []
        "-cpp" : rest -> EnableExtension CPP : go rest
        "-fffi" : rest -> EnableExtension ForeignFunctionInterface : go rest
        "-fglasgow-exts" : rest -> glasgowExtsSettings <> go rest
        opt : rest
          | Just ext <- T.stripPrefix "-X" opt,
            not (T.null ext) ->
              case parseExtensionSettingName ext of
                Just setting -> setting : go rest
                -- Unknown -X flag: skip it rather than failing.
                Nothing -> go rest
        _ : rest -> go rest
1476
-- | Extensions enabled by the legacy @-fglasgow-exts@ flag, as modeled here.
-- NOTE(review): GHC's actual implied set has varied across versions —
-- confirm against the targeted GHC release if exact parity matters.
glasgowExtsSettings :: [ExtensionSetting]
glasgowExtsSettings =
  map
    EnableExtension
    [ ConstrainedClassMethods,
      DeriveDataTypeable,
      DeriveFoldable,
      DeriveFunctor,
      DeriveGeneric,
      DeriveTraversable,
      EmptyDataDecls,
      ExistentialQuantification,
      ExplicitNamespaces,
      FlexibleContexts,
      FlexibleInstances,
      ForeignFunctionInterface,
      FunctionalDependencies,
      GeneralizedNewtypeDeriving,
      ImplicitParams,
      InterruptibleFFI,
      KindSignatures,
      LiberalTypeSynonyms,
      MagicHash,
      MultiParamTypeClasses,
      ParallelListComp,
      PatternGuards,
      PostfixOperators,
      RankNTypes,
      RecursiveDo,
      ScopedTypeVariables,
      StandaloneDeriving,
      TypeOperators,
      TypeSynonymInstances,
      UnboxedTuples,
      UnicodeSyntax,
      UnliftedFFITypes
    ]
1514
-- | Split pragma body text into words, honouring single\/double quotes and
-- backslash escapes (so quoted arguments with spaces stay one word).
--
-- Quote characters themselves are dropped from the output.  An unterminated
-- quote or a trailing backslash simply ends input; whatever was accumulated
-- is still emitted as a final word.
pragmaWords :: Text -> [Text]
pragmaWords txt = go [] [] Nothing (T.unpack txt)
  where
    -- acc: finished words (reversed); current: chars of the word being
    -- built (reversed); quote: Just q while inside a q-quoted region.
    go acc current quote chars =
      case chars of
        [] ->
          let acc' = pushCurrent acc current
           in reverse acc'
        c : rest ->
          case quote of
            Just q
              | c == q -> go acc current Nothing rest
              | c == '\\' ->
                  case rest of
                    escaped : rest' -> go acc (escaped : current) quote rest'
                    -- Trailing backslash: loop once more with empty input
                    -- so the [] case flushes the current word.
                    [] -> go acc current quote []
              | otherwise -> go acc (c : current) quote rest
            Nothing
              | c == '"' || c == '\'' -> go acc current (Just c) rest
              | c == '\\' ->
                  case rest of
                    escaped : rest' -> go acc (escaped : current) Nothing rest'
                    [] -> go acc current Nothing []
              | c `elem` [' ', '\n', '\r', '\t'] ->
                  let acc' = pushCurrent acc current
                   in go acc' [] Nothing rest
              | otherwise -> go acc (c : current) Nothing rest

    -- Flush the (reversed) current word onto the accumulator, if non-empty.
    pushCurrent acc current =
      case reverse current of
        [] -> acc
        token -> T.pack token : acc
1547
-- | Parse a @# 42 ...@ or @#line 42 ...@ directive into a position update.
-- The directive resets the column to 1; anything after the line number
-- (e.g. a file name) is ignored.
parseHashLineDirective :: String -> Maybe DirectiveUpdate
parseHashLineDirective raw =
  let -- Drop the leading '#' (with surrounding whitespace tolerated).
      trimmed = trimLeft (drop 1 (trimLeft raw))
      -- Accept the optional "line" keyword form.
      trimmed' =
        if "line" `List.isPrefixOf` trimmed
          then dropWhile isSpace (drop 4 trimmed)
          else trimmed
      (digits, _) = span isDigit trimmed'
   in if null digits
        then Nothing
        -- 'read' is safe: digits is non-empty and all-decimal.
        else Just DirectiveUpdate {directiveLine = Just (read digits), directiveCol = Just 1}
1559
-- | Parse an in-band @{-# LINE ... #-}@ or @{-# COLUMN ... #-}@ control
-- pragma.
--
-- Returns the exact consumed source text together with either a position
-- update or an error message for a malformed pragma.
--
-- The consumed text is taken verbatim from the input (third component of
-- 'stripNamedPragma') instead of being re-rendered via 'fullPragmaConsumed'.
-- The re-rendered form used canonical spacing, so for a non-canonically
-- spaced pragma such as @{-#LINE 5#-}@ its length differed from the real
-- consumed prefix and callers that @drop (length consumed)@ characters
-- (see 'applyDirectiveAdvance') would desynchronize the input stream.
parseControlPragma :: String -> Maybe (String, Either Text DirectiveUpdate)
parseControlPragma input
  | Just (_, body, consumed) <- stripNamedPragma ["LINE"] input =
      Just (consumed, numericArg body "malformed LINE pragma" lineUpdate)
  | Just (_, body, consumed) <- stripNamedPragma ["COLUMN"] input =
      Just (consumed, numericArg body "malformed COLUMN pragma" colUpdate)
  | otherwise = Nothing
  where
    -- The first whitespace-separated word of the body must be all digits;
    -- otherwise report the supplied error message.
    numericArg body errMsg mkUpdate =
      case words body of
        arg : _
          | all isDigit arg -> Right (mkUpdate (read arg))
        _ -> Left errMsg

    -- LINE resets both the line and the column (to 1).
    lineUpdate n = DirectiveUpdate {directiveLine = Just n, directiveCol = Just 1}

    -- COLUMN only touches the column.
    colUpdate n = DirectiveUpdate {directiveLine = Nothing, directiveCol = Just n}
1583
-- | Re-render a pragma as @{-# NAME body #-}@ with canonical spacing.
--
-- NOTE(review): this is a reconstruction, not the literal source text; when
-- the original pragma used non-canonical spacing (e.g. @{-#LINE 5#-}@) its
-- length differs from what was actually consumed.  Callers that drop
-- @length@ characters from the input should prefer the exact consumed
-- prefix returned by 'stripNamedPragma'.
fullPragmaConsumed :: String -> String -> String
fullPragmaConsumed name body = "{-# " <> name <> " " <> trimRight body <> " #-}"
1586
-- | Assemble a token from its kind and raw text, spanning the positions of
-- the @start@ and @end@ lexer states.
mkToken :: LexerState -> LexerState -> Text -> LexTokenKind -> LexToken
mkToken start end tokTxt kind =
  LexToken
    { lexTokenSpan = mkSpan start end,
      lexTokenText = tokTxt,
      lexTokenKind = kind
    }
1594
-- | Build the source span covering the region between two lexer positions
-- (fields: start line, start column, end line, end column).
mkSpan :: LexerState -> LexerState -> SourceSpan
mkSpan start end =
  SourceSpan (lexerLine start) (lexerCol start) (lexerLine end) (lexerCol end)
1603
-- | Advance the lexer position over the given characters, consuming them
-- from the input.  A newline bumps the line counter, resets the column to 1
-- and marks start-of-line; any other character advances the column.
--
-- Uses a strict left fold: the lazy 'foldl' previously used here builds one
-- suspended record update per consumed character, which is a space leak on
-- long tokens, comments and pragmas.
advanceChars :: String -> LexerState -> LexerState
advanceChars chars st = List.foldl' advanceOne st chars
  where
    advanceOne acc ch =
      case ch of
        '\n' ->
          acc
            { lexerInput = drop 1 (lexerInput acc),
              lexerLine = lexerLine acc + 1,
              lexerCol = 1,
              lexerAtLineStart = True
            }
        _ ->
          acc
            { lexerInput = drop 1 (lexerInput acc),
              lexerCol = lexerCol acc + 1,
              lexerAtLineStart = False
            }
1622
-- | Consume the longest input prefix satisfying the predicate.
consumeWhile :: (Char -> Bool) -> LexerState -> LexerState
consumeWhile keep st = advanceChars prefix st
  where
    prefix = takeWhile keep (lexerInput st)
1625
-- | Take digits, optionally interleaved with underscore separators.
--
-- With separators enabled (NumericUnderscores): the first character must be
-- a digit, runs of underscores (including consecutive ones) may appear
-- between digit groups, and a trailing underscore run is left unconsumed.
--
-- With separators disabled: exactly the leading digit run is taken.
takeDigitsWithUnderscores :: Bool -> (Char -> Bool) -> String -> (String, String)
takeDigitsWithUnderscores allowUnderscores isDigitChar chars
  | null leadingDigits = ("", chars)
  | allowUnderscores = extend leadingDigits afterDigits
  | otherwise = (leadingDigits, afterDigits)
  where
    (leadingDigits, afterDigits) = span isDigitChar chars

    -- Greedily absorb <underscores><digits> groups; stop (without
    -- consuming) at an underscore run that is not followed by a digit.
    extend acc input =
      case input of
        '_' : afterFirst ->
          let (moreUnderscores, afterSeps) = span (== '_') afterFirst
              (digitGroup, remaining) = span isDigitChar afterSeps
           in if null digitGroup
                then (acc, input)
                else extend (acc <> ('_' : moreUnderscores) <> digitGroup) remaining
        _ -> (acc, input)
1656
-- | Take digits after a base prefix, optionally with underscore separators.
--
-- With separators enabled (NumericUnderscores): unlike
-- 'takeDigitsWithUnderscores', leading underscores are allowed (e.g.
-- @0x_ff@, @0x__ff@); underscore runs may also appear between digit groups,
-- and a trailing run is left unconsumed.  At least one digit is required.
--
-- With separators disabled: exactly the leading digit run is taken.
takeDigitsWithLeadingUnderscores :: Bool -> (Char -> Bool) -> String -> (String, String)
takeDigitsWithLeadingUnderscores allowUnderscores isDigitChar chars
  | not allowUnderscores = span isDigitChar chars
  | null firstDigits = ("", chars)
  | otherwise = extend (leadingSeps <> firstDigits) afterDigits
  where
    (leadingSeps, afterSeps) = span (== '_') chars
    (firstDigits, afterDigits) = span isDigitChar afterSeps

    -- Greedily absorb <underscores><digits> groups; stop (without
    -- consuming) at an underscore run that is not followed by a digit.
    extend acc input =
      case input of
        '_' : afterFirst ->
          let (moreUnderscores, afterRun) = span (== '_') afterFirst
              (digitGroup, remaining) = span isDigitChar afterRun
           in if null digitGroup
                then (acc, input)
                else extend (acc <> ('_' : moreUnderscores) <> digitGroup) remaining
        _ -> (acc, input)
1689
-- | Parse the exponent part of a float literal (e.g., "e+10", "E-5").
--
-- When @allowUnderscores@ is True (NumericUnderscores enabled):
-- - Underscores may appear before the exponent marker (e.g., "1_e+23");
--   the underscore run is absorbed into the returned exponent text
-- - Underscores may also appear within the exponent digits
--
-- When @allowUnderscores@ is False:
-- - Only digits are accepted in the exponent
--
-- On any failure, nothing is consumed: the result is @("", chars)@.
takeExponent :: Bool -> String -> (String, String)
takeExponent allowUnderscores chars =
  case chars of
    '_' : afterUnderscore
      | allowUnderscores ->
          let (moreUnderscores, afterRun) = span (== '_') afterUnderscore
           in withPrefix ('_' : moreUnderscores) afterRun
    _ -> withPrefix "" chars
  where
    -- Read @e/E [+/-] digits@ from @input@; @prefix@ is any mantissa-side
    -- underscore run already consumed. Failure backtracks to the original input.
    withPrefix prefix input =
      case input of
        marker : afterMarker
          | marker `elem` ("eE" :: String) ->
              let (signPart, afterSign) =
                    case afterMarker of
                      sign : more | sign `elem` ("+-" :: String) -> ([sign], more)
                      _ -> ("", afterMarker)
                  (digits, remaining) =
                    takeDigitsWithUnderscores allowUnderscores isDigit afterSign
               in if null digits
                    then ("", chars)
                    else (prefix <> [marker] <> signPart <> digits, remaining)
        _ -> ("", chars)
1728
-- | Parse the binary exponent of a hex float (@p@/@P@, optional sign, digits).
-- Returns only the exponent text; the remaining input is not reported.
takeHexExponent :: String -> Maybe String
takeHexExponent (marker : afterMarker)
  | marker == 'p' || marker == 'P' =
      let (signPart, afterSign) =
            case afterMarker of
              sign : more | sign == '+' || sign == '-' -> ([sign], more)
              _ -> ("", afterMarker)
          digits = takeWhile isDigit afterSign
       in if null digits
            then Nothing
            else Just (marker : signPart <> digits)
takeHexExponent _ = Nothing
1741
-- | Scan up to (and consume) the closing delimiter @endCh@, keeping
-- backslash-escaped pairs (including an escaped delimiter) verbatim.
-- 'Right' carries (content, remaining input); 'Left' carries the content
-- scanned so far when the input ends without a closing delimiter.
scanQuoted :: Char -> String -> Either String (String, String)
scanQuoted endCh = loop id
  where
    -- Difference-list accumulator avoids the reverse-at-the-end pattern.
    loop build input =
      case input of
        [] -> Left (build [])
        c : rest
          | c == endCh -> Right (build [], rest)
          | c == '\\' ->
              case rest of
                escaped : rest' -> loop (build . (c :) . (escaped :)) rest'
                [] -> Left (build [c])
          | otherwise -> loop (build . (c :)) rest
1755
-- | Read a (possibly dot-qualified) quasi-quoter name, e.g. @Foo.Bar.qq@.
-- Returns @("", input)@ when the input does not start with an identifier.
takeQuoter :: String -> (String, String)
takeQuoter input =
  case input of
    c : rest
      | isIdentStart c ->
          let (identTail, afterIdent) = span isIdentTailOrStart rest
           in qualify (c : identTail) afterIdent
    _ -> ("", input)
  where
    -- Keep absorbing ".Component" segments while each starts an identifier.
    qualify acc ('.' : c : rest)
      | isIdentStart c =
          let (identTail, afterIdent) = span isIdentTailOrStart rest
           in qualify (acc <> "." <> (c : identTail)) afterIdent
    qualify acc remaining = (acc, remaining)
1772
-- | Split the input at the first occurrence of @marker@: everything before
-- the marker, and the remainder starting at the marker. When the marker does
-- not occur, the whole input is the first component.
breakOnMarker :: String -> String -> (String, String)
breakOnMarker marker = loop id
  where
    loop build input
      | marker `List.isPrefixOf` input = (build [], input)
      | otherwise =
          case input of
            [] -> (build [], [])
            c : rest -> loop (build . (c :)) rest
1782
-- | Take the rest of the current line, including the terminating newline
-- when one is present.
takeLineRemainder :: String -> String
takeLineRemainder chars =
  case break (== '\n') chars of
    (line, '\n' : _) -> line <> "\n"
    (line, _) -> line
1787
-- | Drop leading whitespace.
trimLeft :: String -> String
trimLeft s = dropWhile isSpace s
1790
-- | Drop trailing whitespace.
trimRight :: String -> String
trimRight s = reverse (dropWhile isSpace (reverse s))
1793
-- | Succeed (with @()@) exactly when @prefix@ is a prefix of @actual@;
-- convenient for guarding inside a 'Maybe' pipeline.
guardPrefix :: (Eq a) => [a] -> [a] -> Maybe ()
guardPrefix prefix actual
  | prefix `List.isPrefixOf` actual = Just ()
  | otherwise = Nothing
1799
-- | Parse a Haskell character literal (e.g. @"'a'"@) via 'reads', requiring
-- the entire input to be consumed; anything else yields 'Nothing'.
readMaybeChar :: String -> Maybe Char
readMaybeChar raw =
  case [c | (c, leftover) <- reads raw, null leftover] of
    c : _ -> Just c
    [] -> Nothing
1805
-- | Interpret a hexadecimal literal token (e.g. @0xff@, @0x_f_f@): strip the
-- two-character base prefix and any underscores, then parse. Malformed
-- remainders yield 0.
readHexLiteral :: Text -> Integer
readHexLiteral txt =
  let digits = T.unpack (T.filter (/= '_') (T.drop 2 txt))
   in case readHex digits of
        [(value, "")] -> value
        _ -> 0
1811
-- | Interpret an octal literal token (e.g. @0o17@, @0o7_7@): strip the
-- two-character base prefix and any underscores, then parse. Malformed
-- remainders yield 0.
readOctLiteral :: Text -> Integer
readOctLiteral txt =
  let digits = T.unpack (T.filter (/= '_') (T.drop 2 txt))
   in case readOct digits of
        [(value, "")] -> value
        _ -> 0
1817
-- | Interpret a binary literal token (e.g. @0b101@, @0b1_1@): strip the
-- two-character base prefix and any underscores, then parse base-2 digits.
-- Malformed remainders yield 0.
readBinLiteral :: Text -> Integer
readBinLiteral txt =
  let digits = T.unpack (T.filter (/= '_') (T.drop 2 txt))
      isBinDigit d = d == '0' || d == '1'
   in case readInt 2 isBinDigit digitToInt digits of
        [(value, "")] -> value
        _ -> 0
1823
-- | Combine the integral digits, fractional digits and binary exponent text
-- of a hexadecimal float literal into a 'Double'.
parseHexFloatLiteral :: String -> String -> String -> Double
parseHexFloatLiteral intDigits fracDigits expo = mantissa * scale
  where
    mantissa = parseHexDigits intDigits + parseHexFraction fracDigits
    scale = 2 ^^ exponentValue expo
1827
-- | Accumulate hexadecimal digits into a 'Double', most significant first.
-- Strict left fold: the lazy 'foldl' would build one thunk per digit.
parseHexDigits :: String -> Double
parseHexDigits = List.foldl' (\acc d -> acc * 16 + fromIntegral (digitToInt d)) 0
1830
-- | Accumulate hex digits after the radix point: the i-th digit (1-based)
-- contributes @digit / 16^i@.
parseHexFraction :: String -> Double
parseHexFraction ds = sum (zipWith place ds [1 :: Int ..])
  where
    place d i = fromIntegral (digitToInt d) / (16 ^^ i)
1834
-- | Evaluate an exponent suffix such as @"e+10"@, @"P-5"@ or @"e7"@: skip the
-- one-character marker, honour an optional sign, and read the digits.
-- Returns 0 for empty or malformed input instead of crashing — the original
-- used the partial 'read', which throws on inputs like @"p"@.
exponentValue :: String -> Int
exponentValue expo =
  case expo of
    _ : '-' : ds -> negate (readDigits ds)
    _ : '+' : ds -> readDigits ds
    _ : ds -> readDigits ds
    [] -> 0
  where
    -- Total replacement for 'read': anything but a full digit parse yields 0.
    readDigits ds =
      case reads ds of
        [(n, "")] -> n
        _ -> 0
1842
-- | ASCII identifier-start characters: letters and underscore.
-- (Unicode letters are not accepted here.)
isIdentStart :: Char -> Bool
isIdentStart c = c == '_' || isAsciiUpper c || isAsciiLower c
1845
-- | Identifier continuation characters: alphanumerics, underscore and
-- the prime (apostrophe).
isIdentTail :: Char -> Bool
isIdentTail c = isAlphaNum c || c `elem` ("_'" :: String)
1848
-- | Characters that may form symbolic operators: the ASCII symbol set plus
-- the UnicodeSyntax symbols recognized by 'isUnicodeSymbol'.
isSymbolicOpChar :: Char -> Bool
isSymbolicOpChar c = isAsciiOpChar || isUnicodeSymbol c
  where
    isAsciiOpChar = c `elem` (":!#$%&*+./<=>?@\\^|-~" :: String)
1851
-- | Unicode symbols that may be used with the UnicodeSyntax extension:
-- ∷ (::), ⇒ (=>), → (->), ← (<-), ∀ (forall), ★ (*), ⤙ (-<), ⤚ (>-),
-- ⤛ (-<<), ⤜ (>>-), ⦇ ((|), ⦈ (|)), ⟦ ([|), ⟧ (|]), ⊸ (%1-> with LinearTypes).
isUnicodeSymbol :: Char -> Bool
isUnicodeSymbol c = c `elem` ("∷⇒→←∀★⤙⤚⤛⤜⦇⦈⟦⟧⊸" :: String)
1871
-- | Check if the remainder after '--' should start a line comment.
-- Per Haskell Report: '--' starts a comment only if the entire symbol
-- sequence consists solely of dashes, or is not followed by any symbol
-- character. E.g., '-- foo' and '---' are comments, but '-->' is an operator.
-- Equivalently: skip every further dash, then the dash run is an operator
-- exactly when a non-dash symbol character follows it.
isLineComment :: String -> Bool
isLineComment rest =
  case dropWhile (== '-') rest of
    [] -> True
    c : _ -> not (isSymbolicOpChar c)
1884
-- | Identifier continuation characters (same set as 'isIdentTail'):
-- alphanumerics, underscore and the prime.
isIdentTailOrStart :: Char -> Bool
isIdentTailOrStart c = c `elem` ("_'" :: String) || isAlphaNum c
1887
-- | True when the identifier text maps to a keyword token.
isReservedIdentifier :: Text -> Bool
isReservedIdentifier txt = isJust (keywordTokenKind txt)
1890
-- | Classify identifier text as a keyword token, if it is one.
-- Covers the Report's reserved identifiers plus the context-sensitive
-- import keywords @qualified@, @as@ and @hiding@.
keywordTokenKind :: Text -> Maybe LexTokenKind
keywordTokenKind txt = lookup txt keywordTable
  where
    keywordTable :: [(Text, LexTokenKind)]
    keywordTable =
      [ ("case", TkKeywordCase),
        ("class", TkKeywordClass),
        ("data", TkKeywordData),
        ("default", TkKeywordDefault),
        ("deriving", TkKeywordDeriving),
        ("do", TkKeywordDo),
        ("else", TkKeywordElse),
        ("foreign", TkKeywordForeign),
        ("if", TkKeywordIf),
        ("import", TkKeywordImport),
        ("in", TkKeywordIn),
        ("infix", TkKeywordInfix),
        ("infixl", TkKeywordInfixl),
        ("infixr", TkKeywordInfixr),
        ("instance", TkKeywordInstance),
        ("let", TkKeywordLet),
        ("module", TkKeywordModule),
        ("newtype", TkKeywordNewtype),
        ("of", TkKeywordOf),
        ("then", TkKeywordThen),
        ("type", TkKeywordType),
        ("where", TkKeywordWhere),
        ("_", TkKeywordUnderscore),
        -- Context-sensitive keywords (not strictly reserved per Report)
        ("qualified", TkKeywordQualified),
        ("as", TkKeywordAs),
        ("hiding", TkKeywordHiding)
      ]
1921
-- | Classify reserved operators per Haskell Report Section 2.4.
-- @~@ is intentionally absent: it uses whitespace-sensitive lexing
-- (GHC proposal 0229).
reservedOpTokenKind :: Text -> Maybe LexTokenKind
reservedOpTokenKind txt = lookup txt reservedOpTable
  where
    reservedOpTable :: [(Text, LexTokenKind)]
    reservedOpTable =
      [ ("..", TkReservedDotDot),
        (":", TkReservedColon),
        ("::", TkReservedDoubleColon),
        ("=", TkReservedEquals),
        ("\\", TkReservedBackslash),
        ("|", TkReservedPipe),
        ("<-", TkReservedLeftArrow),
        ("->", TkReservedRightArrow),
        ("@", TkReservedAt),
        ("=>", TkReservedDoubleArrow)
      ]