| Safe Haskell | None |
|---|---|
| Language | GHC2021 |
Aihc.Parser.Lex
Description
This module performs the pre-parse tokenization step for Haskell source code.
It turns raw text into LexTokens that preserve:
- a semantic token classification (LexTokenKind)
- the original token text (lexTokenText)
- source location information (lexTokenSpan)
The lexer runs in two phases:
- Raw tokenization with a custom incremental scanner that consumes one or more input chunks and emits tokens lazily. Extension-specific lexing (such as NegativeLiterals and LexicalNegation) is handled inline during this phase by tracking the previous-token context.
- Layout insertion (applyLayoutTokens), which inserts virtual {, ;, and } according to indentation (the offside rule), so the parser can treat implicit layout like explicit braces and semicolons.
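The previous-token-context trick used for extension-specific lexing can be sketched with the minus sign, which this lexer splits into TkMinusOperator and TkPrefixMinus. The sketch below is a simplified stand-in (toy Prev and Minus types, not the module's real token types, and it ignores which of NegativeLiterals/LexicalNegation is enabled):

```haskell
-- Sketch: decide whether '-' is a binary operator or a prefix minus by
-- looking at the token that precedes it. After an operand (identifier,
-- literal, or closing bracket) '-' is binary; otherwise it is prefix.
data Prev = Operand | NonOperand          -- coarse class of the previous token

data Minus = BinaryMinus | PrefixMinus deriving (Eq, Show)

classifyMinus :: Maybe Prev -> Minus
classifyMinus (Just Operand) = BinaryMinus   -- e.g. "x - 1"
classifyMinus _              = PrefixMinus   -- e.g. "(- 1)" or start of input
```

The real lexer threads this previous-token state through its incremental scan, so the decision costs nothing extra.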
Scanning is incremental and error-tolerant:
- token production starts as soon as enough input is available
- malformed lexemes produce TkError tokens instead of aborting lexing
- # ..., #line ..., {-# LINE #-}, and {-# COLUMN #-} are handled in-band by the lexer and update subsequent token spans without being exposed as normal tokens
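The in-band handling of line directives can be illustrated with a minimal self-contained sketch (Strings instead of Text, and only the {-# LINE n "file" #-} form; the real lexer handles the other directive spellings and updates full spans):

```haskell
import Data.Char (isDigit)
import Data.List (stripPrefix)

-- Walk input lines, consume LINE directives in-band, and report the
-- effective source line number for each remaining line. A directive
-- names the line number of the line that FOLLOWS it, matching GHC's
-- LINE pragma semantics.
renumber :: [String] -> [(Int, String)]
renumber = go 1
  where
    go _ [] = []
    go n (l : ls)
      | Just rest <- stripPrefix "{-# LINE " l
      , (digits@(_ : _), _) <- span isDigit rest =
          go (read digits) ls          -- directive consumed, not emitted
      | otherwise = (n, l) : go (n + 1) ls
```

As in the module itself, the directive never appears in the output; only the positions of later lines change.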
Layout-sensitive syntax is the tricky part. The implementation tracks a stack of
layout contexts and mirrors the haskell-src-exts model summarized in
docs/hse-indentation-layout.md:
- after layout-introducing keywords (currently do, of, let, where, \case, plus optional module body layout), mark a pending implicit block
- if the next token is an explicit {, disable implicit insertion for that block
- otherwise, open an implicit layout context at the next token's column
- at beginning-of-line tokens, a dedent emits a virtual }, and equal indentation emits a virtual ; (with a small suppression rule for then/else)
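The dedent/equal-indent behaviour can be sketched for a single implicit block. This is a deliberately minimal model: the real implementation keeps a stack of layout contexts, re-examines the dedenting token against outer contexts, and applies the then/else suppression rule, all omitted here:

```haskell
-- Offside rule for ONE implicit block: open "{" at the first item's
-- column, emit ";" for items at exactly that column, pass through
-- deeper-indented tokens, and close "}" on the first dedent.
layoutBlock :: [(Int, String)] -> [String]
layoutBlock [] = ["{", "}"]
layoutBlock ((c0, t0) : rest) = "{" : t0 : go rest
  where
    go [] = ["}"]
    go ((c, t) : ts)
      | c == c0   = ";" : t : go ts   -- same column: a new item
      | c >  c0   = t : go ts         -- deeper indent: continuation
      | otherwise = ["}"]             -- dedent: close (the real lexer would
                                      -- hand t to the enclosing context)
```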
Keyword classification is intentionally lexical and exact. lexIdentifier
produces a keyword token only when the full identifier text exactly matches a
reserved word in keywordTokenKind. That means:
- where becomes TkKeywordWhere
- where', _where, and M.where remain identifiers
In other words, use keyword tokens only for exact reserved lexemes; contextual validity is left to the parser.
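The exact-match rule can be sketched self-containedly (a toy Tok type and an abbreviated reserved-word list, not the module's LexTokenKind or keywordTokenKind):

```haskell
-- Only a full-text match against the reserved-word table yields a
-- keyword token; any other identifier text stays an identifier.
data Tok = Keyword String | Ident String deriving (Eq, Show)

reservedWords :: [String]
reservedWords = ["case", "do", "let", "of", "then", "else", "where"]  -- abbreviated

classify :: String -> Tok
classify s
  | s `elem` reservedWords = Keyword s
  | otherwise              = Ident s
```

Because the match is purely lexical, where' and _where never become keywords, and contextual checks (e.g. a stray where in an expression) stay in the parser.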
Synopsis
- data LexToken = LexToken {}
- data LexTokenKind
- = TkKeywordCase
- | TkKeywordClass
- | TkKeywordData
- | TkKeywordDefault
- | TkKeywordDeriving
- | TkKeywordDo
- | TkKeywordElse
- | TkKeywordForeign
- | TkKeywordIf
- | TkKeywordImport
- | TkKeywordIn
- | TkKeywordInfix
- | TkKeywordInfixl
- | TkKeywordInfixr
- | TkKeywordInstance
- | TkKeywordLet
- | TkKeywordModule
- | TkKeywordNewtype
- | TkKeywordOf
- | TkKeywordThen
- | TkKeywordType
- | TkKeywordWhere
- | TkKeywordUnderscore
- | TkKeywordQualified
- | TkKeywordAs
- | TkKeywordHiding
- | TkReservedDotDot
- | TkReservedColon
- | TkReservedDoubleColon
- | TkReservedEquals
- | TkReservedBackslash
- | TkReservedPipe
- | TkReservedLeftArrow
- | TkReservedRightArrow
- | TkReservedAt
- | TkReservedDoubleArrow
- | TkVarId Text
- | TkConId Text
- | TkQVarId Text
- | TkQConId Text
- | TkVarSym Text
- | TkConSym Text
- | TkQVarSym Text
- | TkQConSym Text
- | TkInteger Integer
- | TkIntegerBase Integer Text
- | TkFloat Double Text
- | TkChar Char
- | TkString Text
- | TkSpecialLParen
- | TkSpecialRParen
- | TkSpecialComma
- | TkSpecialSemicolon
- | TkSpecialLBracket
- | TkSpecialRBracket
- | TkSpecialBacktick
- | TkSpecialLBrace
- | TkSpecialRBrace
- | TkMinusOperator
- | TkPrefixMinus
- | TkPrefixBang
- | TkPrefixTilde
- | TkPragmaLanguage [ExtensionSetting]
- | TkPragmaWarning Text
- | TkPragmaDeprecated Text
- | TkQuasiQuote Text Text
- | TkError Text
- isReservedIdentifier :: Text -> Bool
- readModuleHeaderExtensions :: Text -> [ExtensionSetting]
- readModuleHeaderExtensionsFromChunks :: [Text] -> [ExtensionSetting]
- lexTokensFromChunks :: [Text] -> [LexToken]
- lexModuleTokensFromChunks :: [Extension] -> [Text] -> [LexToken]
- lexTokensWithExtensions :: [Extension] -> Text -> [LexToken]
- lexModuleTokensWithExtensions :: [Extension] -> Text -> [LexToken]
- lexTokens :: Text -> [LexToken]
- lexModuleTokens :: Text -> [LexToken]
Documentation

data LexToken Source #

Constructors

| LexToken | a token carrying its kind, original text, and source span |

Instances
data LexTokenKind Source #
Constructors

(as listed under LexTokenKind in the Synopsis above; none carries per-constructor documentation)
Instances
- Shorthand LexTokenKind Source # (defined in Aihc.Parser.Shorthand; shorthand :: LexTokenKind -> Doc ())
- NFData LexTokenKind Source # (defined in Aihc.Parser.Lex; rnf :: LexTokenKind -> ())
- Generic LexTokenKind Source # (defined in Aihc.Parser.Lex)
- Read LexTokenKind Source # (defined in Aihc.Parser.Lex; readsPrec, readList)
- Show LexTokenKind Source # (defined in Aihc.Parser.Lex; showsPrec, show, showList)
- Eq LexTokenKind Source # (defined in Aihc.Parser.Lex)
- Ord LexTokenKind Source # (defined in Aihc.Parser.Lex; compare, (<), (<=), (>), (>=), max, min)
- type Rep LexTokenKind Source #
Defined in Aihc.Parser.Lex type Rep LexTokenKind = D1 ('MetaData "LexTokenKind" "Aihc.Parser.Lex" "aihc-parser-0.1.0.0-DMgbIAjzuEdJKCHQvjmdks" 'False) ((((((C1 ('MetaCons "TkKeywordCase" 'PrefixI 'False) (U1 :: Type -> Type) :+: C1 ('MetaCons "TkKeywordClass" 'PrefixI 'False) (U1 :: Type -> Type)) :+: (C1 ('MetaCons "TkKeywordData" 'PrefixI 'False) (U1 :: Type -> Type) :+: C1 ('MetaCons "TkKeywordDefault" 'PrefixI 'False) (U1 :: Type -> Type))) :+: ((C1 ('MetaCons "TkKeywordDeriving" 'PrefixI 'False) (U1 :: Type -> Type) :+: C1 ('MetaCons "TkKeywordDo" 'PrefixI 'False) (U1 :: Type -> Type)) :+: (C1 ('MetaCons "TkKeywordElse" 'PrefixI 'False) (U1 :: Type -> Type) :+: C1 ('MetaCons "TkKeywordForeign" 'PrefixI 'False) (U1 :: Type -> Type)))) :+: (((C1 ('MetaCons "TkKeywordIf" 'PrefixI 'False) (U1 :: Type -> Type) :+: C1 ('MetaCons "TkKeywordImport" 'PrefixI 'False) (U1 :: Type -> Type)) :+: (C1 ('MetaCons "TkKeywordIn" 'PrefixI 'False) (U1 :: Type -> Type) :+: C1 ('MetaCons "TkKeywordInfix" 'PrefixI 'False) (U1 :: Type -> Type))) :+: ((C1 ('MetaCons "TkKeywordInfixl" 'PrefixI 'False) (U1 :: Type -> Type) :+: C1 ('MetaCons "TkKeywordInfixr" 'PrefixI 'False) (U1 :: Type -> Type)) :+: (C1 ('MetaCons "TkKeywordInstance" 'PrefixI 'False) (U1 :: Type -> Type) :+: C1 ('MetaCons "TkKeywordLet" 'PrefixI 'False) (U1 :: Type -> Type))))) :+: ((((C1 ('MetaCons "TkKeywordModule" 'PrefixI 'False) (U1 :: Type -> Type) :+: C1 ('MetaCons "TkKeywordNewtype" 'PrefixI 'False) (U1 :: Type -> Type)) :+: (C1 ('MetaCons "TkKeywordOf" 'PrefixI 'False) (U1 :: Type -> Type) :+: C1 ('MetaCons "TkKeywordThen" 'PrefixI 'False) (U1 :: Type -> Type))) :+: ((C1 ('MetaCons "TkKeywordType" 'PrefixI 'False) (U1 :: Type -> Type) :+: C1 ('MetaCons "TkKeywordWhere" 'PrefixI 'False) (U1 :: Type -> Type)) :+: (C1 ('MetaCons "TkKeywordUnderscore" 'PrefixI 'False) (U1 :: Type -> Type) :+: C1 ('MetaCons "TkKeywordQualified" 'PrefixI 'False) (U1 :: Type -> Type)))) :+: (((C1 ('MetaCons "TkKeywordAs" 'PrefixI 
'False) (U1 :: Type -> Type) :+: C1 ('MetaCons "TkKeywordHiding" 'PrefixI 'False) (U1 :: Type -> Type)) :+: (C1 ('MetaCons "TkReservedDotDot" 'PrefixI 'False) (U1 :: Type -> Type) :+: C1 ('MetaCons "TkReservedColon" 'PrefixI 'False) (U1 :: Type -> Type))) :+: ((C1 ('MetaCons "TkReservedDoubleColon" 'PrefixI 'False) (U1 :: Type -> Type) :+: C1 ('MetaCons "TkReservedEquals" 'PrefixI 'False) (U1 :: Type -> Type)) :+: (C1 ('MetaCons "TkReservedBackslash" 'PrefixI 'False) (U1 :: Type -> Type) :+: (C1 ('MetaCons "TkReservedPipe" 'PrefixI 'False) (U1 :: Type -> Type) :+: C1 ('MetaCons "TkReservedLeftArrow" 'PrefixI 'False) (U1 :: Type -> Type))))))) :+: (((((C1 ('MetaCons "TkReservedRightArrow" 'PrefixI 'False) (U1 :: Type -> Type) :+: C1 ('MetaCons "TkReservedAt" 'PrefixI 'False) (U1 :: Type -> Type)) :+: (C1 ('MetaCons "TkReservedDoubleArrow" 'PrefixI 'False) (U1 :: Type -> Type) :+: C1 ('MetaCons "TkVarId" 'PrefixI 'False) (S1 ('MetaSel ('Nothing :: Maybe Symbol) 'NoSourceUnpackedness 'NoSourceStrictness 'DecidedLazy) (Rec0 Text)))) :+: ((C1 ('MetaCons "TkConId" 'PrefixI 'False) (S1 ('MetaSel ('Nothing :: Maybe Symbol) 'NoSourceUnpackedness 'NoSourceStrictness 'DecidedLazy) (Rec0 Text)) :+: C1 ('MetaCons "TkQVarId" 'PrefixI 'False) (S1 ('MetaSel ('Nothing :: Maybe Symbol) 'NoSourceUnpackedness 'NoSourceStrictness 'DecidedLazy) (Rec0 Text))) :+: (C1 ('MetaCons "TkQConId" 'PrefixI 'False) (S1 ('MetaSel ('Nothing :: Maybe Symbol) 'NoSourceUnpackedness 'NoSourceStrictness 'DecidedLazy) (Rec0 Text)) :+: C1 ('MetaCons "TkVarSym" 'PrefixI 'False) (S1 ('MetaSel ('Nothing :: Maybe Symbol) 'NoSourceUnpackedness 'NoSourceStrictness 'DecidedLazy) (Rec0 Text))))) :+: (((C1 ('MetaCons "TkConSym" 'PrefixI 'False) (S1 ('MetaSel ('Nothing :: Maybe Symbol) 'NoSourceUnpackedness 'NoSourceStrictness 'DecidedLazy) (Rec0 Text)) :+: C1 ('MetaCons "TkQVarSym" 'PrefixI 'False) (S1 ('MetaSel ('Nothing :: Maybe Symbol) 'NoSourceUnpackedness 'NoSourceStrictness 'DecidedLazy) (Rec0 Text))) :+: (C1 
('MetaCons "TkQConSym" 'PrefixI 'False) (S1 ('MetaSel ('Nothing :: Maybe Symbol) 'NoSourceUnpackedness 'NoSourceStrictness 'DecidedLazy) (Rec0 Text)) :+: C1 ('MetaCons "TkInteger" 'PrefixI 'False) (S1 ('MetaSel ('Nothing :: Maybe Symbol) 'NoSourceUnpackedness 'NoSourceStrictness 'DecidedLazy) (Rec0 Integer)))) :+: ((C1 ('MetaCons "TkIntegerBase" 'PrefixI 'False) (S1 ('MetaSel ('Nothing :: Maybe Symbol) 'NoSourceUnpackedness 'NoSourceStrictness 'DecidedLazy) (Rec0 Integer) :*: S1 ('MetaSel ('Nothing :: Maybe Symbol) 'NoSourceUnpackedness 'NoSourceStrictness 'DecidedLazy) (Rec0 Text)) :+: C1 ('MetaCons "TkFloat" 'PrefixI 'False) (S1 ('MetaSel ('Nothing :: Maybe Symbol) 'NoSourceUnpackedness 'NoSourceStrictness 'DecidedLazy) (Rec0 Double) :*: S1 ('MetaSel ('Nothing :: Maybe Symbol) 'NoSourceUnpackedness 'NoSourceStrictness 'DecidedLazy) (Rec0 Text))) :+: (C1 ('MetaCons "TkChar" 'PrefixI 'False) (S1 ('MetaSel ('Nothing :: Maybe Symbol) 'NoSourceUnpackedness 'NoSourceStrictness 'DecidedLazy) (Rec0 Char)) :+: (C1 ('MetaCons "TkString" 'PrefixI 'False) (S1 ('MetaSel ('Nothing :: Maybe Symbol) 'NoSourceUnpackedness 'NoSourceStrictness 'DecidedLazy) (Rec0 Text)) :+: C1 ('MetaCons "TkSpecialLParen" 'PrefixI 'False) (U1 :: Type -> Type)))))) :+: ((((C1 ('MetaCons "TkSpecialRParen" 'PrefixI 'False) (U1 :: Type -> Type) :+: C1 ('MetaCons "TkSpecialComma" 'PrefixI 'False) (U1 :: Type -> Type)) :+: (C1 ('MetaCons "TkSpecialSemicolon" 'PrefixI 'False) (U1 :: Type -> Type) :+: C1 ('MetaCons "TkSpecialLBracket" 'PrefixI 'False) (U1 :: Type -> Type))) :+: ((C1 ('MetaCons "TkSpecialRBracket" 'PrefixI 'False) (U1 :: Type -> Type) :+: C1 ('MetaCons "TkSpecialBacktick" 'PrefixI 'False) (U1 :: Type -> Type)) :+: (C1 ('MetaCons "TkSpecialLBrace" 'PrefixI 'False) (U1 :: Type -> Type) :+: C1 ('MetaCons "TkSpecialRBrace" 'PrefixI 'False) (U1 :: Type -> Type)))) :+: (((C1 ('MetaCons "TkMinusOperator" 'PrefixI 'False) (U1 :: Type -> Type) :+: C1 ('MetaCons "TkPrefixMinus" 'PrefixI 'False) (U1 
:: Type -> Type)) :+: (C1 ('MetaCons "TkPrefixBang" 'PrefixI 'False) (U1 :: Type -> Type) :+: C1 ('MetaCons "TkPrefixTilde" 'PrefixI 'False) (U1 :: Type -> Type))) :+: ((C1 ('MetaCons "TkPragmaLanguage" 'PrefixI 'False) (S1 ('MetaSel ('Nothing :: Maybe Symbol) 'NoSourceUnpackedness 'NoSourceStrictness 'DecidedLazy) (Rec0 [ExtensionSetting])) :+: C1 ('MetaCons "TkPragmaWarning" 'PrefixI 'False) (S1 ('MetaSel ('Nothing :: Maybe Symbol) 'NoSourceUnpackedness 'NoSourceStrictness 'DecidedLazy) (Rec0 Text))) :+: (C1 ('MetaCons "TkPragmaDeprecated" 'PrefixI 'False) (S1 ('MetaSel ('Nothing :: Maybe Symbol) 'NoSourceUnpackedness 'NoSourceStrictness 'DecidedLazy) (Rec0 Text)) :+: (C1 ('MetaCons "TkQuasiQuote" 'PrefixI 'False) (S1 ('MetaSel ('Nothing :: Maybe Symbol) 'NoSourceUnpackedness 'NoSourceStrictness 'DecidedLazy) (Rec0 Text) :*: S1 ('MetaSel ('Nothing :: Maybe Symbol) 'NoSourceUnpackedness 'NoSourceStrictness 'DecidedLazy) (Rec0 Text)) :+: C1 ('MetaCons "TkError" 'PrefixI 'False) (S1 ('MetaSel ('Nothing :: Maybe Symbol) 'NoSourceUnpackedness 'NoSourceStrictness 'DecidedLazy) (Rec0 Text))))))))) | |||||
isReservedIdentifier :: Text -> Bool Source #
readModuleHeaderExtensions :: Text -> [ExtensionSetting] Source #
Read leading module-header pragmas and return parsed LANGUAGE settings.
This scans only the pragma/header prefix (allowing whitespace and comments) and stops at the first non-pragma token or lexer error token.
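The prefix-only scan can be sketched with a simplified String-based version (the real function works on Text, tolerates comments, and parses full ExtensionSettings including negations, all omitted here):

```haskell
import Data.Char (isSpace)
import Data.List (isSuffixOf, stripPrefix)

-- Collect extension names from leading "{-# LANGUAGE ... #-}" lines and
-- stop at the first line that is neither blank nor a LANGUAGE pragma.
headerExtensions :: [String] -> [String]
headerExtensions = go
  where
    go [] = []
    go (l : ls)
      | all isSpace l = go ls                        -- skip blank lines
      | Just body <- stripPrefix "{-# LANGUAGE" (dropWhile isSpace l)
      , "#-}" `isSuffixOf` l =
          splitExts (takeWhile (/= '#') body) ++ go ls
      | otherwise = []                               -- first non-pragma: stop
    splitExts = words . map (\c -> if c == ',' then ' ' else c)
```

Note how a LANGUAGE pragma appearing after the first non-pragma line is ignored, matching the "header prefix only" behaviour described above.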
readModuleHeaderExtensionsFromChunks :: [Text] -> [ExtensionSetting] Source #
Read leading module-header pragmas from one or more input chunks.
This scans only the pragma/header prefix (allowing whitespace and comments) and stops at the first non-pragma token or lexer error token.
lexTokensFromChunks :: [Text] -> [LexToken] Source #
Lex an expression/declaration stream from one or more input chunks.
Tokens are produced lazily, so downstream consumers can begin parsing before the full source has been scanned.
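The laziness guarantee can be illustrated with a toy chunk tokenizer (whitespace-separated words standing in for real lexemes; the actual lexer scans Text chunks and tracks positions):

```haskell
-- Concatenate chunks lazily and split on whitespace: early tokens are
-- available before later chunks are ever forced.
tokensFromChunks :: [String] -> [String]
tokensFromChunks = words . concat
```

Because both concat and words are lazy, taking the first few tokens never touches trailing chunks, which is what lets a parser start before the whole source has been read.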
lexModuleTokensFromChunks :: [Extension] -> [Text] -> [LexToken] Source #
Lex a full module from one or more input chunks with explicit extensions.
This variant enables module-body layout insertion in addition to the normal token scan and extension rewrites.
lexTokensWithExtensions :: [Extension] -> Text -> [LexToken] Source #
Lex source text using explicit lexer extensions.
This runs raw tokenization, extension rewrites, and implicit-layout insertion.
Module-body layout is not enabled here. Malformed lexemes become TkError
tokens in the token stream.
lexModuleTokensWithExtensions :: [Extension] -> Text -> [LexToken] Source #
Lex module source text using explicit lexer extensions.
Like lexTokensWithExtensions, but also enables top-level module-body layout:
when the source omits explicit braces, virtual layout tokens are inserted
after module ... where (or from the first non-pragma token in module-less files).
lexModuleTokens :: Text -> [LexToken] Source #
Convenience entry point for lexing a full module with no explicit extension list.
Leading header pragmas are scanned first so module-enabled extensions can be applied before token rewrites and top-level layout insertion.