I would like to parse a basic indented language using Megaparsec. Originally I was using Parsec which I managed to get working correctly with indentation but now I'm having quite some trouble.
I've been following a tutorial here and here's the code I have to parse a language ignoring indentation.
module Parser where
import Data.Functor ((<$>), (<$))
import Control.Applicative (Applicative(..))
import qualified Control.Monad as M
import Control.Monad (void)
import Data.Functor.Identity
import Data.Text (Text)
import qualified Data.Text as Text
import Data.Void
import Text.Megaparsec
import Text.Megaparsec.Char
import Text.Megaparsec.Perm
import Text.Megaparsec.Expr
import qualified Text.Megaparsec.Char.Lexer as L
import Text.Pretty.Simple
import Data.Either.Unwrap
--import Lexer
import Syntax
type Parser = Parsec Void String
lineComment :: Parser ()
lineComment = L.skipLineComment "#"
scn :: Parser ()
scn = L.space space1 lineComment empty
sc :: Parser () -- ‘sc’ stands for “space consumer”
sc = L.space (void $ takeWhile1P Nothing f) lineComment empty
where
f x = x == ' ' || x == '\t'
lexeme :: Parser a -> Parser a
lexeme = L.lexeme sc
symbol :: String -> Parser String
symbol = L.symbol sc
integer :: Parser Integer
integer = lexeme L.decimal
semi :: Parser String
semi = symbol ";"
rword :: String -> Parser ()
rword w = lexeme (string w *> notFollowedBy alphaNumChar)
rws :: [String] -- list of reserved words
rws = ["if","then","else","while","do","skip","true","false","not","and","or"]
identifier :: Parser String
identifier = (lexeme . try) (p >>= check)
where
p = (:) <$> letterChar <*> many alphaNumChar
check x = if x `elem` rws
then fail $ "keyword " ++ show x ++ " cannot be an identifier"
else return x
parens :: Parser a -> Parser a
parens = between (symbol "(") (symbol ")")
whileParser :: Parser Stmt
whileParser = between sc eof stmt
stmt :: Parser Stmt
stmt = f <$> sepBy1 stmt' semi
where
-- if there's only one stmt return it without using ‘Seq’
f l = if length l == 1 then head l else Seq l
stmt' :: Parser Stmt
stmt' = ifStmt
<|> whileStmt
<|> skipStmt
<|> assignStmt
<|> parens stmt
ifStmt :: Parser Stmt
ifStmt = do
rword "if"
cond <- bExpr
rword "then"
stmt1 <- stmt
rword "else"
stmt2 <- stmt
return (If cond stmt1 stmt2)
whileStmt :: Parser Stmt
whileStmt = do
rword "while"
cond <- bExpr
rword "do"
stmt1 <- stmt
return (While cond stmt1)
assignStmt :: Parser Stmt
assignStmt = do
var <- identifier
void (symbol ":=")
expr <- aExpr
return (Assign var expr)
skipStmt :: Parser Stmt
skipStmt = Skip <$ rword "skip"
aExpr :: Parser AExpr
aExpr = makeExprParser aTerm aOperators
bExpr :: Parser BExpr
bExpr = makeExprParser bTerm bOperators
aOperators :: [[Operator Parser AExpr]]
aOperators =
[ [Prefix (Neg <$ symbol "-") ]
, [ InfixL (ABinary Multiply <$ symbol "*")
, InfixL (ABinary Divide <$ symbol "/") ]
, [ InfixL (ABinary Add <$ symbol "+")
, InfixL (ABinary Subtract <$ symbol "-") ]
]
bOperators :: [[Operator Parser BExpr]]
bOperators =
[ [Prefix (Not <$ rword "not") ]
, [InfixL (BBinary And <$ rword "and")
, InfixL (BBinary Or <$ rword "or") ]
]
aTerm :: Parser AExpr
aTerm = parens aExpr
<|> Var <$> identifier
<|> IntConst <$> integer
bTerm :: Parser BExpr
bTerm = parens bExpr
<|> (BoolConst True <$ rword "true")
<|> (BoolConst False <$ rword "false")
<|> rExpr
rExpr :: Parser BExpr
rExpr = do
a1 <- aExpr
op <- relation
a2 <- aExpr
return (RBinary op a1 a2)
relation :: Parser RBinOp
relation = (symbol ">" *> pure Greater)
<|> (symbol "<" *> pure Less)
parsePrint :: String -> IO()
parsePrint s = do
parseTest stmt' s
Running this parses correctly.
parsePrint $ unlines
[ "while (true) do if(false) then x := 5 else y := 20"
]
This is the code for parsing indentation from the second tutorial here.
{-# LANGUAGE TupleSections #-}
module Main where
import Control.Applicative (empty)
import Control.Monad (void)
import Data.Void
import Data.Char (isAlphaNum)
import Text.Megaparsec
import Text.Megaparsec.Char
import qualified Text.Megaparsec.Char.Lexer as L
type Parser = Parsec Void String
lineComment :: Parser ()
lineComment = L.skipLineComment "#"
scn :: Parser ()
scn = L.space space1 lineComment empty
sc :: Parser ()
sc = L.space (void $ takeWhile1P Nothing f) lineComment empty
where
f x = x == ' ' || x == '\t'
lexeme :: Parser a -> Parser a
lexeme = L.lexeme sc
pItem :: Parser String
pItem = lexeme (takeWhile1P Nothing f) <?> "list item"
where
f x = isAlphaNum x || x == '-'
pComplexItem :: Parser (String, [String])
pComplexItem = L.indentBlock scn p
where
p = do
header <- pItem
return (L.IndentMany Nothing (return . (header, )) pLineFold)
pLineFold :: Parser String
pLineFold = L.lineFold scn $ \sc' ->
let ps = takeWhile1P Nothing f `sepBy1` try sc'
f x = isAlphaNum x || x == '-'
in unwords <$> ps <* sc
pItemList :: Parser (String, [(String, [String])])
pItemList = L.nonIndented scn (L.indentBlock scn p)
where
p = do
header <- pItem
return (L.IndentSome Nothing (return . (header, )) pComplexItem)
parser :: Parser (String, [(String, [String])])
parser = pItemList <* eof
main :: IO ()
main = return ()
I would like as an example for this to parse correctly.
parsePrint $ unlines
[ "while (true) do"
, " if(false) then x := 5 else y := 20"
]
How could I parse indentation correctly? Also are there any other places with tutorials/documentation on using Megaparsec?
After spending a lot of time on this over the last couple of weeks I managed to work it out. It was a matter of moving from using strings to using my own "Expr" data type.
For anybody else who would like to start writing an indented language this code could be a good start!
Parser
{-# LANGUAGE TupleSections #-}
module IndentTest where
import Control.Applicative (empty)
import Control.Monad (void)
import Data.Void
import Data.Char (isAlphaNum)
import Text.Megaparsec
import Text.Megaparsec.Char
import qualified Text.Megaparsec.Char.Lexer as L
import Text.Megaparsec.Expr
import Block
type Parser = Parsec Void String
-- Tokens
lineComment :: Parser ()
lineComment = L.skipLineComment "#"
scn :: Parser ()
scn = L.space space1 lineComment empty
sc :: Parser ()
sc = L.space (void $ takeWhile1P Nothing f) lineComment empty
where
f x = x == ' ' || x == '\t'
symbol :: String -> Parser String
symbol = L.symbol sc
rword :: String -> Parser ()
rword w = lexeme (string w *> notFollowedBy alphaNumChar)
rws :: [String] -- list of reserved words
rws = ["module", "println", "import", "let", "if","then","else","while","do","skip","true","false","not","and","or"]
word :: Parser String
word = (lexeme . try) (p >>= check)
where
p = (:) <$> alphaNumChar <*> many alphaNumChar
check x = if x `elem` rws
then fail $ "keyword " ++ show x ++ " cannot be an word"
else return x
lexeme :: Parser a -> Parser a
lexeme = L.lexeme sc
integer :: Parser Integer
integer = lexeme L.decimal
parens :: Parser a -> Parser a
parens = between (symbol "(") (symbol ")")
aTerm :: Parser AExpr
aTerm = parens aExpr
<|> Var <$> identifier
<|> IntConst <$> integer
aOperators :: [[Operator Parser AExpr]]
aOperators =
[ [Prefix (Neg <$ symbol "-") ]
, [ InfixL (ABinary Multiply <$ symbol "*")
, InfixL (ABinary Divide <$ symbol "/") ]
, [ InfixL (ABinary Add <$ symbol "+")
, InfixL (ABinary Subtract <$ symbol "-") ]
]
aExpr :: Parser AExpr
aExpr = makeExprParser aTerm aOperators
assignArith :: Parser Expr
assignArith = do
var <- identifier
symbol ":"
vType <- valType
symbol "="
e <- aExpr
return $ AssignArith vType var e
bTerm :: Parser BExpr
bTerm = parens bExpr
<|> (BoolConst True <$ rword "true")
<|> (BoolConst False <$ rword "false")
<|> rExpr
bOperators :: [[Operator Parser BExpr]]
bOperators =
[ [Prefix (Not <$ rword "not") ]
, [InfixL (BBinary And <$ rword "and")
, InfixL (BBinary Or <$ rword "or") ]
]
bExpr :: Parser BExpr
bExpr = makeExprParser bTerm bOperators
rExpr :: Parser BExpr
rExpr = do
a1 <- aExpr
op <- relation
a2 <- aExpr
return (RBinary op a1 a2)
relation :: Parser RBinOp
relation = (symbol ">" *> pure Greater)
<|> (symbol "<" *> pure Less)
identifier :: Parser String
identifier = (lexeme . try) (p >>= check)
where
p = (:) <$> letterChar <*> many alphaNumChar
check x = if x `elem` rws
then fail $ "keyword " ++ show x ++ " cannot be an identifier"
else return x
stringLiteral :: Parser Expr
stringLiteral = do
value <- char '"' >> manyTill L.charLiteral (char '"')
symbol ";"
return $ StringLiteral value
assignString :: Parser Expr
assignString = do
var <- identifier
symbol ":"
vType <- valType
symbol "="
e <- stringLiteral
return (AssignString vType var e)
arrayDef :: Parser Expr
arrayDef = do
name <- identifier
symbol ":"
symbol "["
arrType <- word
symbol "]"
symbol "="
return $ ArrayDef arrType name
arrayValues :: Parser Expr
arrayValues = do
symbol "["
values <- many identifier
symbol "]"
return $ ArrayValues values
arrayAssign :: Parser Expr
arrayAssign = do
def <- arrayDef
values <- arrayValues
return $ ArrayAssignment def values
arrayElementSelect :: Parser Expr
arrayElementSelect = do
symbol "!!"
elementNum <- word
return $ ArrayElementSelect elementNum
moduleParser :: Parser Expr
moduleParser = L.nonIndented scn (L.indentBlock scn p)
where
p = do
rword "module"
name <- identifier
return (L.IndentSome Nothing (return . (Module name)) expr')
valType :: Parser Expr
valType = do
value <- identifier
return $ Type value
argumentType :: Parser Expr
argumentType = do
value <- identifier
return $ ArgumentType value
returnType :: Parser Expr
returnType = do
value <- identifier
return $ ReturnType value
argument :: Parser Expr
argument = do
value <- identifier
return $ Argument value
-- Function parser
functionParser :: Parser Expr
functionParser = L.indentBlock scn p
where
p = do
name <- identifier
symbol ":"
argTypes <- some argumentType
symbol "->"
rType <- IndentTest.returnType
nameDup <- L.lineFold scn $ \sp' ->
(identifier) `sepBy1` try sp' <* scn
args <- many argument
symbol "="
if(name == "main") then
return (L.IndentMany Nothing (return . (MainFunction name argTypes args rType)) expr')
else
return (L.IndentMany Nothing (return . (Function name argTypes args rType)) expr')
functionCallParser :: Parser Expr
functionCallParser = do
name <- identifier
args <- parens $ many argument
return $ FunctionCall name args
printParser :: Parser Expr
printParser = do
rword "println"
bodyArr <- identifier
symbol ";"
return $ Print bodyArr
valueToken :: Parser String
valueToken = lexeme (takeWhile1P Nothing f) <?> "list item"
where
f x = isAlphaNum x || x == '-'
ifStmt :: Parser Expr
ifStmt = L.indentBlock scn p
where
p = do
rword "if"
cond <- bExpr
return (L.IndentMany Nothing (return . (If cond)) expr')
elseStmt :: Parser Expr
elseStmt = L.indentBlock scn p
where
p = do
rword "else"
return (L.IndentMany Nothing (return . (Else)) expr')
whereStmt :: Parser Expr
whereStmt = do
rword "where"
symbol "{"
exprs <- many expr
symbol "}"
return $ (Where exprs)
expr :: Parser Expr
expr = f <$> sepBy1 expr' (symbol ";")
where
-- if there's only one expr return it without using ‘Seq’
f l = if length l == 1 then head l else Seq l
expr' :: Parser Expr
expr' = try moduleParser
<|> try functionParser
<|> try ifStmt
<|> try elseStmt
<|> try arrayAssign
<|> arrayElementSelect
<|> try assignArith
<|> try functionCallParser
<|> try assignString
<|> try printParser
<|> try whereStmt
<|> try stringLiteral
parser :: Parser Expr
parser = expr'
parseFromFile file = runParser expr file <$> readFile file
parseString input =
case parse expr' "" input of
Left e -> show e
Right x -> show x
parsePrint :: String -> IO()
parsePrint s = parseTest' parser s
Block/Expr - The AST consists of this
module Block where
import Data.List
import Text.Show.Functions
import Data.Char
import Data.Maybe
-- Boolean expressions
data BExpr
= BoolConst Bool
| Not BExpr
| BBinary BBinOp BExpr BExpr
| RBinary RBinOp AExpr AExpr
instance Show BExpr where
show (BoolConst b) = lowerString $ show b
show (Not n) = show n
show (BBinary bbinop bExpr1 bExpr2) = show bExpr1 ++ " " ++ show bbinop ++ " " ++ show bExpr2
show (RBinary rbinop aExpr1 aExpr2) = show aExpr1 ++ " " ++ show rbinop ++ " " ++ show aExpr2
-- Boolean ops
data BBinOp
= And
| Or
instance Show BBinOp where
show (And) = "&&"
show (Or) = "||"
-- R binary ops
data RBinOp
= Greater
| Less
instance Show RBinOp where
show (Greater) = ">"
show (Less) = "<"
-- Arithmetic expressions
data AExpr
= Var String
| IntConst Integer
| Neg AExpr
| ABinary ABinOp AExpr AExpr
| Parenthesis AExpr
instance Show AExpr where
show (Var v) = v
show (IntConst i) = show i
show (Neg aExpr) = "-" ++ show aExpr
show (ABinary aBinOp aExpr1 aExpr2) = show aExpr1 ++ " " ++ show aBinOp ++ " " ++ show aExpr2
show (Parenthesis aExpr) = "(" ++ show aExpr ++ ")"
-- Arithmetic ops
data ABinOp
= OpeningParenthesis
| ClosingParenthesis
| Add
| Subtract
| Multiply
| Divide
instance Show ABinOp where
show (Add) = "+"
show (Subtract) = "-"
show (Multiply) = "*"
show (Divide) = "/"
show (OpeningParenthesis) = "("
show (ClosingParenthesis) = ")"
-- Statements
data Expr
= Seq [Expr]
| Module String [Expr]
| Import String String
| MainFunction {name ::String, argTypes:: [Expr], args::[Expr], returnType::Expr, body::[Expr]}
| Function String [Expr] [Expr] Expr [Expr]
| FunctionCall String [Expr]
| Type String
| ValueType String
| Argument String
| ArgumentType String
| ReturnType String
| AssignArith Expr String AExpr
| AssignString Expr String Expr
| If BExpr [Expr]
| Else [Expr]
| While BExpr [Expr]
| Print String
| Return Expr
| ArrayValues [String]
| ArrayDef String String
| ArrayAssignment Expr Expr
| ArrayElementSelect String
| Lambda String String
| Where [Expr]
| StringLiteral String
| Skip
instance Show Expr where
show (Module name bodyArray) =
-- Get the main function tree
"public class " ++ name ++ "{\n" ++
"public static void main(String[] args){\n" ++
name ++ " " ++ lowerString name ++ "= new " ++ name ++ "();\n" ++
intercalate "\n" (map (\mStatement -> if(isFunctionCall mStatement) then (lowerString name ++ "." ++ show mStatement) else show mStatement) (body ((filter (isMainFunction) bodyArray)!!0))) ++
"}\n" ++
getFunctionString bodyArray ++
"}\n"
show (Import directory moduleName) = "import " ++ directory ++ moduleName
show (Function name argTypes args returnType body) = "public " ++ show returnType ++ " " ++ name ++ "("++ intercalate ", " (zipWith (\x y -> x ++ " " ++ y) (map show argTypes) (map show args)) ++"){\n" ++ intercalate "\n" (map show body) ++ "}"
show (MainFunction name argTypes args returnType body) =
intercalate "\n " $ map show body
show (FunctionCall name exprs) = name ++ "(" ++ (intercalate ", " (map show exprs)) ++ ");"
show (Type b) = b
show (Argument b) = b
show (ArgumentType b) = b
show (ReturnType b) = b
show (AssignArith vType name value) = "" ++ show vType ++ " " ++ name ++ "=" ++ show value ++ ";"
show (AssignString vType name value) = "" ++ show vType ++ " " ++ name ++ "=" ++ show value ++ ";"
show (If condition statement) = "if(" ++ show condition ++ "){\n" ++ intercalate "\n" (map show statement) ++ "}"
show (Else statement) = " else {\n" ++ intercalate "\n" (map show statement) ++ "}"
show (While condition statement) = "while(" ++ show condition ++ "){\n" ++ intercalate "\n" (map show statement) ++ "}"
show (Skip) = "[skip]"
show (Seq s) = "[seq]"
show (Return expr) = "return " ++ show expr ++ ";"
show (Print exprs) = "System.out.println(" ++ exprs ++ ");" --"System.out.println(" ++ intercalate " " (map show exprs) ++ ");"
show (ArrayDef arrType name) = arrType ++ "[] " ++ name ++ "="
show (ArrayValues exprs) = "{" ++ intercalate ", " exprs ++ "};"
show (ArrayAssignment arr values) = show arr ++ show values
show (ArrayElementSelect i) = "[" ++ i ++ "];"
show (Lambda valName collectionName) = ""
show (Where exprs) = intercalate "\n" (map show exprs)
show (StringLiteral value) = "\"" ++ value ++ "\""
show (_) = "<unknown>"
lowerString str = [ toLower loweredString | loweredString <- str]
extractMain :: Expr -> Maybe String
extractMain (MainFunction m _ _ _ _) = Just m
extractMain _ = Nothing
extractFunctionCall :: Expr -> Maybe String
extractFunctionCall (FunctionCall m _) = Just m
extractFunctionCall _ = Nothing
isMainFunction :: Expr -> Bool
isMainFunction e = isJust $ extractMain e
isFunctionCall :: Expr -> Bool
isFunctionCall e = isJust $ extractFunctionCall e
{--
getInnerMainFunctionString :: [Expr] -> String -> String
getInnerMainFunctionString e instanceName = do
if(isMainFunction (e!!0)) then
show (e!!0)
else
getInnerMainFunctionString (drop 1 e) instanceName
--}
getFunctionString :: [Expr] -> String
getFunctionString e = do
if(isMainFunction (e!!0)) then
""
else
"" ++ show (e!!0) ++ getFunctionString (drop 1 e)
Code Example
module IndentationTest
testFunction : int -> void
testFunction x =
if(x < 50)
println x;
nextX :int = x + 1 * 2 - 3 / 2 + 5
testFunction (nextX)
else
last :int = 1000
println last;
main : String -> IO
main args =
x :int = 3
y :int = 10
z :int = 15
arrTest:[int] = [x y z]
println arrTest;
testFunction (x)
stringTest :String = "Helloworld";
This will successfully parse the example code. Just pass it into the parsePrint function.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With