I would like to parse a basic indented language using Megaparsec. Originally I was using Parsec which I managed to get working correctly with indentation but now I'm having quite some trouble.
I've been following a tutorial here and here's the code I have to parse a language ignoring indentation.
module Parser where
import           Data.Functor                  ((<$>), (<$))
import           Control.Applicative           (Applicative(..))
import qualified Control.Monad                 as M
import Control.Monad (void)
import           Data.Functor.Identity
import           Data.Text                     (Text)
import qualified Data.Text                     as Text
import Data.Void
import Text.Megaparsec
import Text.Megaparsec.Char
import Text.Megaparsec.Perm
import Text.Megaparsec.Expr
import qualified Text.Megaparsec.Char.Lexer as L
import Text.Pretty.Simple
import Data.Either.Unwrap
--import Lexer
import Syntax
type Parser = Parsec Void String
lineComment :: Parser ()
lineComment = L.skipLineComment "#"
scn :: Parser ()
scn = L.space space1 lineComment empty
sc :: Parser () -- ‘sc’ stands for “space consumer”
sc = L.space (void $ takeWhile1P Nothing f) lineComment empty
  where
    f x = x == ' ' || x == '\t'
lexeme :: Parser a -> Parser a
lexeme = L.lexeme sc
symbol :: String -> Parser String
symbol = L.symbol sc
integer :: Parser Integer
integer = lexeme L.decimal
semi :: Parser String
semi = symbol ";"
rword :: String -> Parser ()
rword w = lexeme (string w *> notFollowedBy alphaNumChar)
rws :: [String] -- list of reserved words
rws = ["if","then","else","while","do","skip","true","false","not","and","or"]
identifier :: Parser String
identifier = (lexeme . try) (p >>= check)
  where
    p       = (:) <$> letterChar <*> many alphaNumChar
    check x = if x `elem` rws
                then fail $ "keyword " ++ show x ++ " cannot be an identifier"
                else return x
parens :: Parser a -> Parser a
parens = between (symbol "(") (symbol ")")
whileParser :: Parser Stmt
whileParser = between sc eof stmt
stmt :: Parser Stmt
stmt = f <$> sepBy1 stmt' semi
  where
    -- if there's only one stmt return it without using ‘Seq’
    f l = if length l == 1 then head l else Seq l
stmt' :: Parser Stmt
stmt' = ifStmt
  <|> whileStmt
  <|> skipStmt
  <|> assignStmt
  <|> parens stmt
ifStmt :: Parser Stmt
ifStmt = do
    rword "if"
    cond  <- bExpr
    rword "then"
    stmt1 <- stmt
    rword "else"
    stmt2 <- stmt
    return (If cond stmt1 stmt2)
whileStmt :: Parser Stmt
whileStmt = do
  rword "while"
  cond <- bExpr
  rword "do"
  stmt1 <- stmt
  return (While cond stmt1)
assignStmt :: Parser Stmt
assignStmt = do
  var  <- identifier
  void (symbol ":=")
  expr <- aExpr
  return (Assign var expr)
skipStmt :: Parser Stmt
skipStmt = Skip <$ rword "skip"
aExpr :: Parser AExpr
aExpr = makeExprParser aTerm aOperators
bExpr :: Parser BExpr
bExpr = makeExprParser bTerm bOperators
aOperators :: [[Operator Parser AExpr]]
aOperators =
  [ [Prefix (Neg <$ symbol "-") ]
  , [ InfixL (ABinary Multiply <$ symbol "*")
    , InfixL (ABinary Divide   <$ symbol "/") ]
  , [ InfixL (ABinary Add      <$ symbol "+")
    , InfixL (ABinary Subtract <$ symbol "-") ]
  ]
bOperators :: [[Operator Parser BExpr]]
bOperators =
  [ [Prefix (Not <$ rword "not") ]
  , [InfixL (BBinary And <$ rword "and")
    , InfixL (BBinary Or <$ rword "or") ]
  ]
aTerm :: Parser AExpr
aTerm = parens aExpr
  <|> Var      <$> identifier
  <|> IntConst <$> integer
bTerm :: Parser BExpr
bTerm =  parens bExpr
  <|> (BoolConst True  <$ rword "true")
  <|> (BoolConst False <$ rword "false")
  <|> rExpr
rExpr :: Parser BExpr
rExpr = do
  a1 <- aExpr
  op <- relation
  a2 <- aExpr
  return (RBinary op a1 a2)
relation :: Parser RBinOp
relation = (symbol ">" *> pure Greater)
  <|> (symbol "<" *> pure Less)
parsePrint :: String -> IO()
parsePrint s = do
    parseTest stmt' s
Running this parses correctly.
parsePrint $ unlines
[ "while (true) do if(false) then x := 5 else y := 20"
]
This is the code for parsing indentation from the second tutorial here.
{-# LANGUAGE TupleSections #-}
module Main where
import Control.Applicative (empty)
import Control.Monad (void)
import Data.Void
import Data.Char (isAlphaNum)
import Text.Megaparsec
import Text.Megaparsec.Char
import qualified Text.Megaparsec.Char.Lexer as L
type Parser = Parsec Void String
lineComment :: Parser ()
lineComment = L.skipLineComment "#"
scn :: Parser ()
scn = L.space space1 lineComment empty
sc :: Parser ()
sc = L.space (void $ takeWhile1P Nothing f) lineComment empty
  where
    f x = x == ' ' || x == '\t'
lexeme :: Parser a -> Parser a
lexeme = L.lexeme sc
pItem :: Parser String
pItem = lexeme (takeWhile1P Nothing f) <?> "list item"
  where
    f x = isAlphaNum x || x == '-'
pComplexItem :: Parser (String, [String])
pComplexItem = L.indentBlock scn p
  where
    p = do
      header <- pItem
      return (L.IndentMany Nothing (return . (header, )) pLineFold)
pLineFold :: Parser String
pLineFold = L.lineFold scn $ \sc' ->
  let ps = takeWhile1P Nothing f `sepBy1` try sc'
      f x = isAlphaNum x || x == '-'
  in unwords <$> ps <* sc
pItemList :: Parser (String, [(String, [String])])
pItemList = L.nonIndented scn (L.indentBlock scn p)
  where
    p = do
      header <- pItem
      return (L.IndentSome Nothing (return . (header, )) pComplexItem)
parser :: Parser (String, [(String, [String])])
parser = pItemList <* eof
main :: IO ()
main = return ()
I would like as an example for this to parse correctly.
parsePrint $ unlines
[ "while (true) do" 
, "    if(false) then x := 5 else y := 20"
]
How could I parse indentation correctly? Also are there any other places with tutorials/documentation on using Megaparsec?
After spending a lot of time on this over the last couple of weeks I managed to work it out. It was a matter of moving from using strings to using my own "Expr" data type.
For anybody else who would like to start writing an indented language this code could be a good start!
Parser
{-# LANGUAGE TupleSections #-}
module IndentTest where
import Control.Applicative (empty)
import Control.Monad (void)
import Data.Void
import Data.Char (isAlphaNum)
import Text.Megaparsec
import Text.Megaparsec.Char
import qualified Text.Megaparsec.Char.Lexer as L
import Text.Megaparsec.Expr
import Block
type Parser = Parsec Void String
-- Tokens
lineComment :: Parser ()
lineComment = L.skipLineComment "#"
scn :: Parser ()
scn = L.space space1 lineComment empty
sc :: Parser ()
sc = L.space (void $ takeWhile1P Nothing f) lineComment empty
  where
    f x = x == ' ' || x == '\t'
symbol :: String -> Parser String
symbol = L.symbol sc
rword :: String -> Parser ()
rword w = lexeme (string w *> notFollowedBy alphaNumChar)
rws :: [String] -- list of reserved words
rws = ["module", "println", "import",  "let", "if","then","else","while","do","skip","true","false","not","and","or"]
word :: Parser String
word = (lexeme . try) (p >>= check)
  where
    p       = (:) <$> alphaNumChar <*> many alphaNumChar
    check x = if x `elem` rws
                then fail $ "keyword " ++ show x ++ " cannot be an word"
                else return x
lexeme :: Parser a -> Parser a
lexeme = L.lexeme sc
integer :: Parser Integer
integer = lexeme L.decimal
parens :: Parser a -> Parser a
parens = between (symbol "(") (symbol ")")
aTerm :: Parser AExpr
aTerm = parens aExpr
  <|> Var      <$> identifier
  <|> IntConst <$> integer
aOperators :: [[Operator Parser AExpr]]
aOperators =
  [ [Prefix (Neg <$ symbol "-") ]
  , [ InfixL (ABinary Multiply <$ symbol "*")
    , InfixL (ABinary Divide   <$ symbol "/") ]
  , [ InfixL (ABinary Add      <$ symbol "+")
    , InfixL (ABinary Subtract <$ symbol "-") ]
  ]
aExpr :: Parser AExpr
aExpr = makeExprParser aTerm aOperators
assignArith :: Parser Expr
assignArith = do
  var  <- identifier
  symbol ":"
  vType <- valType
  symbol "="
  e <- aExpr
  return $ AssignArith vType var e
bTerm :: Parser BExpr
bTerm =  parens bExpr
  <|> (BoolConst True  <$ rword "true")
  <|> (BoolConst False <$ rword "false")
  <|> rExpr
bOperators :: [[Operator Parser BExpr]]
bOperators =
  [ [Prefix (Not <$ rword "not") ]
  , [InfixL (BBinary And <$ rword "and")
    , InfixL (BBinary Or <$ rword "or") ]
  ]
bExpr :: Parser BExpr
bExpr = makeExprParser bTerm bOperators
rExpr :: Parser BExpr
rExpr = do
  a1 <- aExpr
  op <- relation
  a2 <- aExpr
  return (RBinary op a1 a2)
relation :: Parser RBinOp
relation = (symbol ">" *> pure Greater)
  <|> (symbol "<" *> pure Less)
identifier :: Parser String
identifier = (lexeme . try) (p >>= check)
  where
    p       = (:) <$> letterChar <*> many alphaNumChar
    check x = if x `elem` rws
                then fail $ "keyword " ++ show x ++ " cannot be an identifier"
                else return x
stringLiteral :: Parser Expr
stringLiteral = do
  value <- char '"' >> manyTill L.charLiteral (char '"')
  symbol ";"
  return $ StringLiteral value
assignString :: Parser Expr
assignString = do
  var  <- identifier
  symbol ":"
  vType <- valType
  symbol "="
  e <- stringLiteral
  return (AssignString vType var e)
arrayDef :: Parser Expr
arrayDef = do
  name <- identifier
  symbol ":"
  symbol "["
  arrType <- word
  symbol "]"
  symbol "="
  return $ ArrayDef arrType name
arrayValues :: Parser Expr
arrayValues = do
  symbol "["
  values <- many identifier
  symbol "]"
  return $ ArrayValues values
arrayAssign :: Parser Expr
arrayAssign = do
  def <- arrayDef
  values <- arrayValues
  return $ ArrayAssignment def values
arrayElementSelect :: Parser Expr
arrayElementSelect = do
  symbol "!!"
  elementNum <- word
  return $ ArrayElementSelect elementNum
moduleParser :: Parser Expr
moduleParser = L.nonIndented scn (L.indentBlock scn p)
  where
    p = do
      rword "module"
      name <- identifier
      return (L.IndentSome Nothing (return . (Module name)) expr')
valType :: Parser Expr
valType = do
    value <- identifier
    return $ Type value
argumentType :: Parser Expr
argumentType = do
    value <- identifier
    return $ ArgumentType value
returnType :: Parser Expr
returnType = do
    value <- identifier
    return $ ReturnType value
argument :: Parser Expr
argument = do
  value <- identifier
  return $ Argument value
-- Function parser
functionParser :: Parser Expr
functionParser = L.indentBlock scn p
  where
    p = do
      name <- identifier
      symbol ":"
      argTypes <- some argumentType
      symbol "->"
      rType <- IndentTest.returnType
      nameDup <- L.lineFold scn $ \sp' ->
        (identifier) `sepBy1` try sp' <* scn
      args <- many argument
      symbol "="
      if(name == "main") then
          return (L.IndentMany Nothing (return . (MainFunction name argTypes args rType)) expr')
      else
          return (L.IndentMany Nothing (return . (Function name argTypes args rType)) expr')
functionCallParser :: Parser Expr
functionCallParser = do
  name <- identifier
  args <- parens $ many argument
  return $ FunctionCall name args
printParser :: Parser Expr
printParser = do
  rword "println"
  bodyArr <- identifier
  symbol ";"
  return $ Print bodyArr
valueToken :: Parser String
valueToken = lexeme (takeWhile1P Nothing f) <?> "list item"
  where
    f x = isAlphaNum x || x == '-'
ifStmt :: Parser Expr
ifStmt = L.indentBlock scn p
   where
     p = do
       rword "if"
       cond  <- bExpr
       return (L.IndentMany Nothing (return . (If cond)) expr')
elseStmt :: Parser Expr
elseStmt = L.indentBlock scn p
   where
     p = do
       rword "else"
       return (L.IndentMany Nothing (return . (Else)) expr')
whereStmt :: Parser Expr
whereStmt = do
  rword "where"
  symbol "{"
  exprs <- many expr
  symbol "}"
  return $ (Where exprs)
expr :: Parser Expr
expr = f <$> sepBy1 expr' (symbol ";")
  where
    -- if there's only one expr return it without using ‘Seq’
    f l = if length l == 1 then head l else Seq l
expr' :: Parser Expr
expr' = try moduleParser
  <|> try functionParser
  <|> try ifStmt
  <|> try elseStmt
  <|> try arrayAssign
  <|> arrayElementSelect
  <|> try assignArith
  <|> try functionCallParser
  <|> try assignString
  <|> try printParser
  <|> try whereStmt
  <|> try stringLiteral
parser :: Parser Expr
parser = expr'
parseFromFile file = runParser expr file <$> readFile file
parseString input =
  case parse expr' "" input of
    Left  e -> show e
    Right x -> show x
parsePrint :: String -> IO()
parsePrint s = parseTest' parser s
Block/Expr - The AST consists of this
module Block where
import Data.List
import Text.Show.Functions
import Data.Char
import Data.Maybe
-- Boolean expressions
data BExpr
  = BoolConst Bool
  | Not BExpr
  | BBinary BBinOp BExpr BExpr
  | RBinary RBinOp AExpr AExpr
instance Show BExpr where
    show (BoolConst b) = lowerString $ show b
    show (Not n) = show n
    show (BBinary bbinop bExpr1 bExpr2) = show bExpr1 ++ " " ++ show bbinop ++ " " ++ show bExpr2
    show (RBinary rbinop aExpr1 aExpr2) = show aExpr1 ++ " " ++ show rbinop ++ " " ++ show aExpr2
-- Boolean ops
data BBinOp
  = And
  | Or
instance Show BBinOp where
    show (And) = "&&"
    show (Or) = "||"
-- R binary ops
data RBinOp
  = Greater
  | Less
instance Show RBinOp where
    show (Greater) = ">"
    show (Less) = "<"
-- Arithmetic expressions
data AExpr
  = Var String
  | IntConst Integer
  | Neg AExpr
  | ABinary ABinOp AExpr AExpr
  | Parenthesis AExpr
instance Show AExpr where
    show (Var v) = v
    show (IntConst i) = show i
    show (Neg aExpr) = "-" ++ show aExpr
    show (ABinary aBinOp aExpr1 aExpr2) = show aExpr1 ++ " " ++ show aBinOp ++ " " ++ show aExpr2
    show (Parenthesis aExpr) = "(" ++ show aExpr ++ ")"
-- Arithmetic ops
data ABinOp
  = OpeningParenthesis
  | ClosingParenthesis
  | Add
  | Subtract
  | Multiply
  | Divide
instance Show ABinOp where
    show (Add) = "+"
    show (Subtract) = "-"
    show (Multiply) = "*"
    show (Divide) = "/"
    show (OpeningParenthesis) = "("
    show (ClosingParenthesis) = ")"
-- Statements
data Expr
  = Seq [Expr]
  | Module String [Expr]
  | Import String String
  | MainFunction {name ::String, argTypes:: [Expr], args::[Expr], returnType::Expr, body::[Expr]}
  | Function String [Expr] [Expr] Expr [Expr]
  | FunctionCall String [Expr]
  | Type String
  | ValueType String
  | Argument String
  | ArgumentType String
  | ReturnType String
  | AssignArith Expr String AExpr
  | AssignString Expr String Expr
  | If BExpr [Expr]
  | Else [Expr]
  | While BExpr [Expr]
  | Print String
  | Return Expr
  | ArrayValues [String]
  | ArrayDef String String
  | ArrayAssignment Expr Expr
  | ArrayElementSelect String
  | Lambda String String
  | Where [Expr]
  | StringLiteral String
  | Skip
instance Show Expr where
    show (Module name bodyArray) =
        -- Get the main function tree
        "public class " ++ name ++ "{\n" ++
            "public static void main(String[] args){\n" ++
                name ++ " " ++ lowerString name ++ "= new " ++ name ++ "();\n" ++
                intercalate "\n" (map (\mStatement -> if(isFunctionCall mStatement) then (lowerString name ++ "." ++ show mStatement) else show mStatement) (body ((filter (isMainFunction) bodyArray)!!0))) ++
            "}\n" ++
            getFunctionString bodyArray ++
        "}\n"
    show (Import directory moduleName) = "import " ++ directory ++ moduleName
    show (Function name argTypes args returnType body) = "public " ++ show returnType ++ " " ++ name ++ "("++ intercalate ", " (zipWith (\x y -> x ++ " " ++ y) (map show argTypes) (map show args)) ++"){\n" ++ intercalate "\n" (map show body) ++ "}"
    show (MainFunction name argTypes args returnType body) =
        intercalate "\n " $ map show body
    show (FunctionCall name exprs) = name ++ "(" ++ (intercalate ", " (map show exprs)) ++ ");"
    show (Type b) = b
    show (Argument b) = b
    show (ArgumentType b) = b
    show (ReturnType b) = b
    show (AssignArith vType name value) = "" ++ show vType ++ " " ++ name ++ "=" ++ show value ++ ";"
    show (AssignString vType name value) = "" ++ show vType ++ " " ++ name ++ "=" ++ show value ++ ";"
    show (If condition statement) = "if(" ++ show condition ++ "){\n" ++ intercalate "\n" (map show statement) ++ "}"
    show (Else statement) = " else {\n" ++ intercalate "\n" (map show statement) ++ "}"
    show (While condition statement) = "while(" ++ show condition ++ "){\n" ++ intercalate "\n" (map show statement) ++ "}"
    show (Skip) = "[skip]"
    show (Seq s) = "[seq]"
    show (Return expr) = "return " ++ show expr ++ ";"
    show (Print exprs) = "System.out.println(" ++ exprs ++ ");" --"System.out.println(" ++ intercalate " " (map show exprs) ++ ");"
    show (ArrayDef arrType name) = arrType ++ "[] " ++ name ++ "="
    show (ArrayValues exprs) = "{" ++ intercalate ", " exprs ++ "};"
    show (ArrayAssignment arr values) = show arr ++ show values
    show (ArrayElementSelect i) = "[" ++ i ++ "];"
    show (Lambda valName collectionName) = ""
    show (Where exprs) = intercalate "\n" (map show exprs)
    show (StringLiteral value) = "\"" ++ value ++ "\""
    show (_) = "<unknown>"
lowerString str = [ toLower loweredString | loweredString <- str]
extractMain :: Expr -> Maybe String
extractMain (MainFunction m _ _ _ _) = Just m
extractMain _ = Nothing
extractFunctionCall :: Expr -> Maybe String
extractFunctionCall (FunctionCall m _) = Just m
extractFunctionCall _ = Nothing
isMainFunction :: Expr -> Bool
isMainFunction e = isJust $ extractMain e
isFunctionCall :: Expr -> Bool
isFunctionCall e = isJust $ extractFunctionCall e
{--
getInnerMainFunctionString :: [Expr] -> String -> String
getInnerMainFunctionString e instanceName  = do
    if(isMainFunction (e!!0)) then
      show (e!!0)
    else
      getInnerMainFunctionString (drop 1 e) instanceName
--}
getFunctionString :: [Expr] -> String
getFunctionString e = do
    if(isMainFunction (e!!0)) then
      ""
    else
      "" ++ show (e!!0) ++ getFunctionString (drop 1 e)
Code Example
module IndentationTest
    testFunction : int -> void
    testFunction x =
        if(x < 50)
            println x;
            nextX :int = x + 1 * 2 - 3 / 2 + 5
            testFunction (nextX)
        else
            last :int = 1000
            println last;
    main : String -> IO
    main args =
        x :int = 3
        y :int = 10
        z :int = 15
        arrTest:[int] = [x y z]
        println arrTest;
        testFunction (x)
        stringTest :String = "Helloworld";
This will successfully parse the example code. Just pass it into the parsePrint function.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With