Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Indentation using Megaparsec

I would like to parse a basic indented language using Megaparsec. Originally I was using Parsec which I managed to get working correctly with indentation but now I'm having quite some trouble.

I've been following a tutorial here and here's the code I have to parse a language ignoring indentation.

module Parser where

import           Data.Functor                  ((<$>), (<$))
import           Control.Applicative           (Applicative(..))
import qualified Control.Monad                 as M
import Control.Monad (void)
import           Data.Functor.Identity
import           Data.Text                     (Text)
import qualified Data.Text                     as Text

import Data.Void

import Text.Megaparsec
import Text.Megaparsec.Char
import Text.Megaparsec.Perm
import Text.Megaparsec.Expr
import qualified Text.Megaparsec.Char.Lexer as L

import Text.Pretty.Simple
import Data.Either.Unwrap

--import Lexer
import Syntax

type Parser = Parsec Void String

lineComment :: Parser ()
lineComment = L.skipLineComment "#"

scn :: Parser ()
scn = L.space space1 lineComment empty

sc :: Parser () -- ‘sc’ stands for “space consumer”
sc = L.space (void $ takeWhile1P Nothing f) lineComment empty
  where
    f x = x == ' ' || x == '\t'

lexeme :: Parser a -> Parser a
lexeme = L.lexeme sc

symbol :: String -> Parser String
symbol = L.symbol sc

integer :: Parser Integer
integer = lexeme L.decimal


semi :: Parser String
semi = symbol ";"

rword :: String -> Parser ()
rword w = lexeme (string w *> notFollowedBy alphaNumChar)

rws :: [String] -- list of reserved words
rws = ["if","then","else","while","do","skip","true","false","not","and","or"]

identifier :: Parser String
identifier = (lexeme . try) (p >>= check)
  where
    p       = (:) <$> letterChar <*> many alphaNumChar
    check x = if x `elem` rws
                then fail $ "keyword " ++ show x ++ " cannot be an identifier"
                else return x


parens :: Parser a -> Parser a
parens = between (symbol "(") (symbol ")")


whileParser :: Parser Stmt
whileParser = between sc eof stmt

stmt :: Parser Stmt
stmt = f <$> sepBy1 stmt' semi
  where
    -- if there's only one stmt return it without using ‘Seq’
    f l = if length l == 1 then head l else Seq l

stmt' :: Parser Stmt
stmt' = ifStmt
  <|> whileStmt
  <|> skipStmt
  <|> assignStmt
  <|> parens stmt

ifStmt :: Parser Stmt
ifStmt = do
    rword "if"
    cond  <- bExpr
    rword "then"
    stmt1 <- stmt
    rword "else"
    stmt2 <- stmt
    return (If cond stmt1 stmt2)

whileStmt :: Parser Stmt
whileStmt = do
  rword "while"
  cond <- bExpr
  rword "do"
  stmt1 <- stmt
  return (While cond stmt1)

assignStmt :: Parser Stmt
assignStmt = do
  var  <- identifier
  void (symbol ":=")
  expr <- aExpr
  return (Assign var expr)

skipStmt :: Parser Stmt
skipStmt = Skip <$ rword "skip"

aExpr :: Parser AExpr
aExpr = makeExprParser aTerm aOperators

bExpr :: Parser BExpr
bExpr = makeExprParser bTerm bOperators

aOperators :: [[Operator Parser AExpr]]
aOperators =
  [ [Prefix (Neg <$ symbol "-") ]
  , [ InfixL (ABinary Multiply <$ symbol "*")
    , InfixL (ABinary Divide   <$ symbol "/") ]
  , [ InfixL (ABinary Add      <$ symbol "+")
    , InfixL (ABinary Subtract <$ symbol "-") ]
  ]

bOperators :: [[Operator Parser BExpr]]
bOperators =
  [ [Prefix (Not <$ rword "not") ]
  , [InfixL (BBinary And <$ rword "and")
    , InfixL (BBinary Or <$ rword "or") ]
  ]

aTerm :: Parser AExpr
aTerm = parens aExpr
  <|> Var      <$> identifier
  <|> IntConst <$> integer

bTerm :: Parser BExpr
bTerm =  parens bExpr
  <|> (BoolConst True  <$ rword "true")
  <|> (BoolConst False <$ rword "false")
  <|> rExpr

rExpr :: Parser BExpr
rExpr = do
  a1 <- aExpr
  op <- relation
  a2 <- aExpr
  return (RBinary op a1 a2)

relation :: Parser RBinOp
relation = (symbol ">" *> pure Greater)
  <|> (symbol "<" *> pure Less)

parsePrint :: String -> IO()
parsePrint s = do
    parseTest stmt' s

Running this parses correctly.

parsePrint $ unlines
[ "while (true) do if(false) then x := 5 else y := 20"
]

This is the code for parsing indentation from the second tutorial here.

{-# LANGUAGE TupleSections #-}

module Main where

import Control.Applicative (empty)
import Control.Monad (void)
import Data.Void
import Data.Char (isAlphaNum)
import Text.Megaparsec
import Text.Megaparsec.Char
import qualified Text.Megaparsec.Char.Lexer as L

type Parser = Parsec Void String

lineComment :: Parser ()
lineComment = L.skipLineComment "#"

scn :: Parser ()
scn = L.space space1 lineComment empty

sc :: Parser ()
sc = L.space (void $ takeWhile1P Nothing f) lineComment empty
  where
    f x = x == ' ' || x == '\t'

lexeme :: Parser a -> Parser a
lexeme = L.lexeme sc

pItem :: Parser String
pItem = lexeme (takeWhile1P Nothing f) <?> "list item"
  where
    f x = isAlphaNum x || x == '-'

pComplexItem :: Parser (String, [String])
pComplexItem = L.indentBlock scn p
  where
    p = do
      header <- pItem
      return (L.IndentMany Nothing (return . (header, )) pLineFold)

pLineFold :: Parser String
pLineFold = L.lineFold scn $ \sc' ->
  let ps = takeWhile1P Nothing f `sepBy1` try sc'
      f x = isAlphaNum x || x == '-'
  in unwords <$> ps <* sc

pItemList :: Parser (String, [(String, [String])])
pItemList = L.nonIndented scn (L.indentBlock scn p)
  where
    p = do
      header <- pItem
      return (L.IndentSome Nothing (return . (header, )) pComplexItem)

parser :: Parser (String, [(String, [String])])
parser = pItemList <* eof

main :: IO ()
main = return ()

I would like as an example for this to parse correctly.

parsePrint $ unlines
[ "while (true) do" 
, "    if(false) then x := 5 else y := 20"
]

How could I parse indentation correctly? Also are there any other places with tutorials/documentation on using Megaparsec?

like image 952
Michael Avatar asked Jan 15 '18 21:01

Michael


1 Answers

After spending a lot of time on this over the last couple of weeks I managed to work it out. It was a matter of moving from using strings to using my own "Expr" data type.
For anybody else who would like to start writing an indented language this code could be a good start!

Parser

{-# LANGUAGE TupleSections #-}

module IndentTest where

import Control.Applicative (empty)
import Control.Monad (void)
import Data.Void
import Data.Char (isAlphaNum)
import Text.Megaparsec
import Text.Megaparsec.Char
import qualified Text.Megaparsec.Char.Lexer as L
import Text.Megaparsec.Expr

import Block

type Parser = Parsec Void String

-- Tokens


lineComment :: Parser ()
lineComment = L.skipLineComment "#"


scn :: Parser ()
scn = L.space space1 lineComment empty


sc :: Parser ()
sc = L.space (void $ takeWhile1P Nothing f) lineComment empty
  where
    f x = x == ' ' || x == '\t'


symbol :: String -> Parser String
symbol = L.symbol sc


rword :: String -> Parser ()
rword w = lexeme (string w *> notFollowedBy alphaNumChar)


rws :: [String] -- list of reserved words
rws = ["module", "println", "import",  "let", "if","then","else","while","do","skip","true","false","not","and","or"]


word :: Parser String
word = (lexeme . try) (p >>= check)
  where
    p       = (:) <$> alphaNumChar <*> many alphaNumChar
    check x = if x `elem` rws
                then fail $ "keyword " ++ show x ++ " cannot be an word"
                else return x

lexeme :: Parser a -> Parser a
lexeme = L.lexeme sc


integer :: Parser Integer
integer = lexeme L.decimal


parens :: Parser a -> Parser a
parens = between (symbol "(") (symbol ")")


aTerm :: Parser AExpr
aTerm = parens aExpr
  <|> Var      <$> identifier
  <|> IntConst <$> integer


aOperators :: [[Operator Parser AExpr]]
aOperators =
  [ [Prefix (Neg <$ symbol "-") ]
  , [ InfixL (ABinary Multiply <$ symbol "*")
    , InfixL (ABinary Divide   <$ symbol "/") ]
  , [ InfixL (ABinary Add      <$ symbol "+")
    , InfixL (ABinary Subtract <$ symbol "-") ]
  ]


aExpr :: Parser AExpr
aExpr = makeExprParser aTerm aOperators


assignArith :: Parser Expr
assignArith = do
  var  <- identifier
  symbol ":"
  vType <- valType
  symbol "="
  e <- aExpr
  return $ AssignArith vType var e


bTerm :: Parser BExpr
bTerm =  parens bExpr
  <|> (BoolConst True  <$ rword "true")
  <|> (BoolConst False <$ rword "false")
  <|> rExpr


bOperators :: [[Operator Parser BExpr]]
bOperators =
  [ [Prefix (Not <$ rword "not") ]
  , [InfixL (BBinary And <$ rword "and")
    , InfixL (BBinary Or <$ rword "or") ]
  ]


bExpr :: Parser BExpr
bExpr = makeExprParser bTerm bOperators


rExpr :: Parser BExpr
rExpr = do
  a1 <- aExpr
  op <- relation
  a2 <- aExpr
  return (RBinary op a1 a2)


relation :: Parser RBinOp
relation = (symbol ">" *> pure Greater)
  <|> (symbol "<" *> pure Less)


identifier :: Parser String
identifier = (lexeme . try) (p >>= check)
  where
    p       = (:) <$> letterChar <*> many alphaNumChar
    check x = if x `elem` rws
                then fail $ "keyword " ++ show x ++ " cannot be an identifier"
                else return x


stringLiteral :: Parser Expr
stringLiteral = do
  value <- char '"' >> manyTill L.charLiteral (char '"')
  symbol ";"
  return $ StringLiteral value


assignString :: Parser Expr
assignString = do
  var  <- identifier
  symbol ":"
  vType <- valType
  symbol "="
  e <- stringLiteral
  return (AssignString vType var e)


arrayDef :: Parser Expr
arrayDef = do
  name <- identifier
  symbol ":"

  symbol "["
  arrType <- word
  symbol "]"

  symbol "="
  return $ ArrayDef arrType name

arrayValues :: Parser Expr
arrayValues = do
  symbol "["
  values <- many identifier
  symbol "]"
  return $ ArrayValues values

arrayAssign :: Parser Expr
arrayAssign = do
  def <- arrayDef
  values <- arrayValues
  return $ ArrayAssignment def values

arrayElementSelect :: Parser Expr
arrayElementSelect = do
  symbol "!!"
  elementNum <- word
  return $ ArrayElementSelect elementNum


moduleParser :: Parser Expr
moduleParser = L.nonIndented scn (L.indentBlock scn p)
  where
    p = do
      rword "module"
      name <- identifier
      return (L.IndentSome Nothing (return . (Module name)) expr')


valType :: Parser Expr
valType = do
    value <- identifier
    return $ Type value


argumentType :: Parser Expr
argumentType = do
    value <- identifier
    return $ ArgumentType value


returnType :: Parser Expr
returnType = do
    value <- identifier
    return $ ReturnType value


argument :: Parser Expr
argument = do
  value <- identifier
  return $ Argument value


-- Function parser
functionParser :: Parser Expr
functionParser = L.indentBlock scn p
  where
    p = do
      name <- identifier
      symbol ":"
      argTypes <- some argumentType
      symbol "->"
      rType <- IndentTest.returnType
      nameDup <- L.lineFold scn $ \sp' ->
        (identifier) `sepBy1` try sp' <* scn
      args <- many argument
      symbol "="
      if(name == "main") then
          return (L.IndentMany Nothing (return . (MainFunction name argTypes args rType)) expr')
      else
          return (L.IndentMany Nothing (return . (Function name argTypes args rType)) expr')



functionCallParser :: Parser Expr
functionCallParser = do
  name <- identifier
  args <- parens $ many argument
  return $ FunctionCall name args


printParser :: Parser Expr
printParser = do
  rword "println"
  bodyArr <- identifier
  symbol ";"
  return $ Print bodyArr


valueToken :: Parser String
valueToken = lexeme (takeWhile1P Nothing f) <?> "list item"
  where
    f x = isAlphaNum x || x == '-'


ifStmt :: Parser Expr
ifStmt = L.indentBlock scn p
   where
     p = do
       rword "if"
       cond  <- bExpr
       return (L.IndentMany Nothing (return . (If cond)) expr')

elseStmt :: Parser Expr
elseStmt = L.indentBlock scn p
   where
     p = do
       rword "else"
       return (L.IndentMany Nothing (return . (Else)) expr')

whereStmt :: Parser Expr
whereStmt = do
  rword "where"
  symbol "{"
  exprs <- many expr
  symbol "}"
  return $ (Where exprs)


expr :: Parser Expr
expr = f <$> sepBy1 expr' (symbol ";")
  where
    -- if there's only one expr return it without using ‘Seq’
    f l = if length l == 1 then head l else Seq l


expr' :: Parser Expr
expr' = try moduleParser
  <|> try functionParser
  <|> try ifStmt
  <|> try elseStmt
  <|> try arrayAssign
  <|> arrayElementSelect
  <|> try assignArith
  <|> try functionCallParser
  <|> try assignString
  <|> try printParser
  <|> try whereStmt
  <|> try stringLiteral


parser :: Parser Expr
parser = expr'


parseFromFile file = runParser expr file <$> readFile file


parseString input =
  case parse expr' "" input of
    Left  e -> show e
    Right x -> show x


parsePrint :: String -> IO()
parsePrint s = parseTest' parser s

Block/Expr - The AST consists of this

module Block where

import Data.List
import Text.Show.Functions
import Data.Char
import Data.Maybe

-- Boolean expressions
data BExpr
  = BoolConst Bool
  | Not BExpr
  | BBinary BBinOp BExpr BExpr
  | RBinary RBinOp AExpr AExpr

instance Show BExpr where
    show (BoolConst b) = lowerString $ show b
    show (Not n) = show n
    show (BBinary bbinop bExpr1 bExpr2) = show bExpr1 ++ " " ++ show bbinop ++ " " ++ show bExpr2
    show (RBinary rbinop aExpr1 aExpr2) = show aExpr1 ++ " " ++ show rbinop ++ " " ++ show aExpr2


-- Boolean ops
data BBinOp
  = And
  | Or

instance Show BBinOp where
    show (And) = "&&"
    show (Or) = "||"

-- R binary ops
data RBinOp
  = Greater
  | Less

instance Show RBinOp where
    show (Greater) = ">"
    show (Less) = "<"

-- Arithmetic expressions
data AExpr
  = Var String
  | IntConst Integer
  | Neg AExpr
  | ABinary ABinOp AExpr AExpr
  | Parenthesis AExpr

instance Show AExpr where
    show (Var v) = v
    show (IntConst i) = show i
    show (Neg aExpr) = "-" ++ show aExpr
    show (ABinary aBinOp aExpr1 aExpr2) = show aExpr1 ++ " " ++ show aBinOp ++ " " ++ show aExpr2
    show (Parenthesis aExpr) = "(" ++ show aExpr ++ ")"

-- Arithmetic ops
data ABinOp
  = OpeningParenthesis
  | ClosingParenthesis
  | Add
  | Subtract
  | Multiply
  | Divide

instance Show ABinOp where
    show (Add) = "+"
    show (Subtract) = "-"
    show (Multiply) = "*"
    show (Divide) = "/"
    show (OpeningParenthesis) = "("
    show (ClosingParenthesis) = ")"

-- Statements
data Expr
  = Seq [Expr]
  | Module String [Expr]
  | Import String String
  | MainFunction {name ::String, argTypes:: [Expr], args::[Expr], returnType::Expr, body::[Expr]}
  | Function String [Expr] [Expr] Expr [Expr]
  | FunctionCall String [Expr]
  | Type String
  | ValueType String
  | Argument String
  | ArgumentType String
  | ReturnType String
  | AssignArith Expr String AExpr
  | AssignString Expr String Expr
  | If BExpr [Expr]
  | Else [Expr]
  | While BExpr [Expr]
  | Print String
  | Return Expr
  | ArrayValues [String]
  | ArrayDef String String
  | ArrayAssignment Expr Expr
  | ArrayElementSelect String
  | Lambda String String
  | Where [Expr]
  | StringLiteral String
  | Skip

instance Show Expr where
    show (Module name bodyArray) =
        -- Get the main function tree

        "public class " ++ name ++ "{\n" ++
            "public static void main(String[] args){\n" ++
                name ++ " " ++ lowerString name ++ "= new " ++ name ++ "();\n" ++
                intercalate "\n" (map (\mStatement -> if(isFunctionCall mStatement) then (lowerString name ++ "." ++ show mStatement) else show mStatement) (body ((filter (isMainFunction) bodyArray)!!0))) ++
            "}\n" ++
            getFunctionString bodyArray ++
        "}\n"

    show (Import directory moduleName) = "import " ++ directory ++ moduleName
    show (Function name argTypes args returnType body) = "public " ++ show returnType ++ " " ++ name ++ "("++ intercalate ", " (zipWith (\x y -> x ++ " " ++ y) (map show argTypes) (map show args)) ++"){\n" ++ intercalate "\n" (map show body) ++ "}"
    show (MainFunction name argTypes args returnType body) =
        intercalate "\n " $ map show body
    show (FunctionCall name exprs) = name ++ "(" ++ (intercalate ", " (map show exprs)) ++ ");"
    show (Type b) = b
    show (Argument b) = b
    show (ArgumentType b) = b
    show (ReturnType b) = b
    show (AssignArith vType name value) = "" ++ show vType ++ " " ++ name ++ "=" ++ show value ++ ";"
    show (AssignString vType name value) = "" ++ show vType ++ " " ++ name ++ "=" ++ show value ++ ";"
    show (If condition statement) = "if(" ++ show condition ++ "){\n" ++ intercalate "\n" (map show statement) ++ "}"
    show (Else statement) = " else {\n" ++ intercalate "\n" (map show statement) ++ "}"
    show (While condition statement) = "while(" ++ show condition ++ "){\n" ++ intercalate "\n" (map show statement) ++ "}"
    show (Skip) = "[skip]"
    show (Seq s) = "[seq]"
    show (Return expr) = "return " ++ show expr ++ ";"
    show (Print exprs) = "System.out.println(" ++ exprs ++ ");" --"System.out.println(" ++ intercalate " " (map show exprs) ++ ");"
    show (ArrayDef arrType name) = arrType ++ "[] " ++ name ++ "="
    show (ArrayValues exprs) = "{" ++ intercalate ", " exprs ++ "};"
    show (ArrayAssignment arr values) = show arr ++ show values
    show (ArrayElementSelect i) = "[" ++ i ++ "];"
    show (Lambda valName collectionName) = ""
    show (Where exprs) = intercalate "\n" (map show exprs)
    show (StringLiteral value) = "\"" ++ value ++ "\""
    show (_) = "<unknown>"

lowerString str = [ toLower loweredString | loweredString <- str]

extractMain :: Expr -> Maybe String
extractMain (MainFunction m _ _ _ _) = Just m
extractMain _ = Nothing

extractFunctionCall :: Expr -> Maybe String
extractFunctionCall (FunctionCall m _) = Just m
extractFunctionCall _ = Nothing

isMainFunction :: Expr -> Bool
isMainFunction e = isJust $ extractMain e

isFunctionCall :: Expr -> Bool
isFunctionCall e = isJust $ extractFunctionCall e

{--
getInnerMainFunctionString :: [Expr] -> String -> String
getInnerMainFunctionString e instanceName  = do
    if(isMainFunction (e!!0)) then
      show (e!!0)
    else
      getInnerMainFunctionString (drop 1 e) instanceName
--}
getFunctionString :: [Expr] -> String
getFunctionString e = do
    if(isMainFunction (e!!0)) then
      ""
    else
      "" ++ show (e!!0) ++ getFunctionString (drop 1 e)

Code Example

module IndentationTest
    testFunction : int -> void
    testFunction x =
        if(x < 50)
            println x;
            nextX :int = x + 1 * 2 - 3 / 2 + 5
            testFunction (nextX)
        else
            last :int = 1000
            println last;

    main : String -> IO
    main args =
        x :int = 3
        y :int = 10
        z :int = 15
        arrTest:[int] = [x y z]
        println arrTest;
        testFunction (x)
        stringTest :String = "Helloworld";

This will successfully parse the example code. Just pass it into the parsePrint function.

like image 118
Michael Avatar answered Oct 19 '22 22:10

Michael