Aller au contenu

Module:FParser

Depi Wikipedya, ansiklopedi lib

La documentation pour ce module peut être créée à Module:FParser/doc

local lexer = {}
local parser = {}

--[[ These parser functions are generic functions to build a parser.
It works with "Lexing" and "Parsing" functions. 
The parser is initialized by the "parse" function, who generates a "state" object that must be a parameter of each parsing functions,
                                         and eventually returns the main node of the AST if everything goes well
* Lexing functions have one parameter, the state of the parser, and returns a modified state 
  if they could find the terminals they are supposed to recognize and  or nothing if the parse fails. 
* Parsing functions always have a state as unique parameter. 
  They can be divided into
  * Generic one, written in this module that corresponds that helps to build a parser 
    but don't generate nodes of the AST themselves, like "alternative", "chain", "star" or "plus"  
    * "chain" corresponds to the concatenation operation in a grammar. for example a function that parses the EBNF rule
            twelve  = "1", "2" ;
      will be generated by chain{lex_char("1"), lex_char("2")}
    * "alternative" corresponds to the alternative operation in a grammar, for example a function that parses the EBNF rule 
            digit_excluding_zero = "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" ;
       will be written alternative{lex_char("1"), lex_char("2"), lex_char("3"), lex_char("4"), lex_char("5"), lex_char("6"), 
                                   lex_char("7"), lex_char("8"), lex_char("9")}
  * User written one, that are functions that must generate a node of the AST as a "node" attribute of the "state" object.
    To do that they must use attributes of the state object filled by the generic one like 
    * "parsed", the string parsed by the last lexing function,
    * "acc", the list of nodes that are generated by the function that are passed to function that iterates their call namely "star" and "plus" 
    They return nothing if the parse fails, or the state of the last state returned by a lexer function called if they don't.

Other functions are
* maybe : a function to avoid checking if the state is nil every time : if the state is in a fail state, 
  it does not apply the parsing or lexing function. Maybe allows to chain the application of lexing and parsing function easily.
  attribute of the "newstate" object to be able to access the "acc" or "node" of the previous object in a chain, for example.
* idop : a function that takes a function as parameter that is used to compute an action on the current state in a chain of parsing function, 
  mainly modifying variables in the closure. Functions passed to idop returns nothing. Usable for example for logging.
* statemodop : a function that takes a function as parameter ; who computes a node in the AST from the state and the variables of the closure and typically adding 
               it to the state.
               Those functions have the same signatures than parsing functions, but typically they do not parse anything.
--]]

--------------------------------------------------------------------------------
-- lexer
--------------------------------------------------------------------------------

local function lex_regex(state, regex)

	local res = string.match(state.newstate.str, "^" .. regex)
	if res then
		local newstate = {}
		local len = string.len(res)
		newstate.str = state.newstate.str:sub(len + 1)
		newstate.pos = state.newstate.pos + len
		return {
			['lexed'] = res, 
			["newstate"] = newstate,
			["node"] = state.node -- forwarding node in case it's not consumed 
                                              -- case of lexing a closing parenthesis after the node creation.
                                              -- this is to avoid to have to create a closure variable
		}
	else
		return nil
	end
end

lexer.lex_regex = lex_regex

local function lex_epsilon(state)
	return lex_regex(state, "")
end

--[[
Tests: p.parse("a", p.chain{p.lexer.lex_epsilon, p.lex_char("a"),p.lexer.lex_epsilon})
--]]

lexer.lex_epsilon = lex_epsilon



local lex_char = function(char)
	return function(state)
		return lex_regex(state, char)
	end
end

-- tested with "p.parse("/", p.lex_char('/'))"
lexer.lex_char = lex_char


function lexer.open_parenthesis(state)
	return lex_regex(state, "[(]")
end

function lexer.close_parenthesis(state)
	return lex_regex(state, "[)]")
end

function lexer.lex_integer(state)
	return lex_regex(state, "[0-9]+")
end


function lexer.eat_blanks(state)
	return lex_regex(state,' *')
end

parser.lexer = lexer

-------------------------------------------------------
-- generic parser property
----------------------------------------------------------


local function maybe(func)
	return function(state)
		if state then
			return func(state)
		end
	end
end

parser.maybe = maybe

local function accumulate(aggregate_func)
	return function(state)
		return {
			["newstate"]=state.newstate, 
			["node"]= aggregate_func(state.acc)
		}
	end
end

parser.accumulate = accumulate


local function map_node(map_func)
	return function(state)
		if state then
			return {
				["newstate"] = state.newstate, 
				["node"] = map_func(state.node)
			}
		end
	end
end

parser.map_node = map_node

local function idop(func)
	return function(state)
		if state then
			func(state)
			return state
		end
	end
end

parser.idop = idop


-- logs
local show = function (string) return idop(function(state) mw.log(string) end) end
parser.log = show
parser.show = show

local dump_state =  idop(function(state) mw.logObject(state) end)
parser.dump_state = dump_state


-- this allows avoiding to pass the state beetween each functions if they were called by hand
local function chain(funcs)
	return function(state)
		local i = 1
		local res = funcs[1](state)
		
		while i < #funcs and res do
			i = i+1
			if funcs[i] == nil then
				error("a nil func in chain")
			end
			res = funcs[i](res)
		end
		return res
	end
end


--[[
Tests : 
p.parse("aba", p.chain{p.lex_char('a'), p.lex_char('b'), p.lex_char('a')}) => yes
p.parse("aba", p.chain{p.lex_char('a'), p.lex_char('b')}) => nope
p.parse("aba", p.chain{p.lex_char('a'), p.lex_char('b'), p.lex_char('a'), p.lex_char('a')}) => nope
--]]

parser.chain = chain

-- higher order function that can parse an alternative between several non terminals.
-- returns the state of the first match
local function alternative(funcs)
	return function(state)
		local i = 1
		while i <= #funcs do
			local res = funcs[i](state)
			if res then 
				return res 
			end
			i=i+1
		end
	end
end

--[[
Tests : 
p.parse("a", p.alternative{p.lex_char('a'), p.lex_char('b')}) => yes
p.parse("b", p.alternative{p.lex_char('a'), p.lex_char('b')}) => yes
p.parse("c", p.alternative{p.lex_char('a'), p.lex_char('b')}) => nope

--]]

parser.alternative = alternative


local function star(parsing_func)
	local function star_rec(state, acc, i)

		local res = chain{
			parsing_func,
			idop(
				function (stat)
					table.insert(acc, stat.node)
				end
			),
		}(state)
	
		if res then
			return star_rec(res, acc, i + 1)
		else
			return state, acc
		end
	end

	return function(state)
		if state then
			local acc = {}
			local result, acc2 = star_rec(state, acc, 1)
			result.acc = acc2
			return result
		end
	end
end

--[[
Tests: p.parse("aaab", p.chain{p.star(p.lex_char("a")), p.lex_char("b")}) => yes
p.parse("b", p.chain{p.star(p.lex_char("a")), p.lex_char("b")}) => yes
--]]

parser.star = star


local function plus(parsing_func)
	return function(state)
		local firstnode
		local acc
		
		return chain{
			parsing_func,
			idop(
				function(state)
					firstnode = state.node
				end
			),
			star(parsing_func),
			function(state)
				acc = state.acc
				table.insert(acc, 1, firstnode)
				state.acc = acc
				return state
			end
		}(state)
	end
end


--[[
res = p.parse("aaab", 
              p.chain{
                    p.plus(
                        p.chain{ 
                              p.lex_char("a"), 
                              p.statemodop(function(res) res.node="a" ;  return res; end)
                        }
                    ), 
                    p.lex_char("b")
              }
             )
--]]

parser.plus = plus


--[[
Tests : 
p.parse("a", p.chain{p.lex_char('a'), p.idop(function (state) end )}) => yes
p.parse("ab", p.chain{p.lex_char('a'), p.idop(function (state) end), p.lex_char('b') }) => nope
p.parse("ab", p.chain{p.lex_char('a'), p.idop(function (state) end), p.lex_char('a') }) => nope
--]]


local function questionmark(parsing_func)
	return function(state)
		local node = nil
		local res=alternative{
			chain{
				parsing_func,
				idop(function (stat) node = stat.node end)
			},
			lex_epsilon
		}(state)
		res.node = node
		return res
	end
end

parser.questionmark = questionmark

--------------------------------------------------------------------------------
-- main function that launches the parsing
--------------------------------------------------------------------------------

parser.parse = function (string, base_parsing_function)
	local state = {}
	
	state.newstate = {}
	state.newstate.str = string
	state.newstate.pos = 1
	
	local res_node = nil
		
	local res = chain{
		base_parsing_function,
		idop(function(stat) res_node = stat.node end),
		lexer.eat_blanks
	}(state)

	if res and res.newstate.str=="" then
		return res_node
	end
	return nil
end

-- a function to create an AST node which represents an nary operator. Could parse an expression on the form « 1 + 2 + 3 + 4 » and return a « sum » AST node wich have a list of nodes for operands sum([number(1), number(2), number(3) number(4) ) in our example

local function nary_op_parser (
	first_node_parser,      -- a parser that parses only the first element of the operation (« 1 »)
	next_nodes_parser,      -- a parser that parses the infix operation and one node ( « + 2 » and the subsequent)
	acc_result_node_creator,-- a function that takes the lists of the node generated by the previous parser and generates the final node
	single_creator          -- optional, a function corner case when the expression has no actual operator one element like « 1 » (one might just want to get a « number(1) » AST node instead of « sum([number(1)]) »
	                        -- if that’s not the wanted behavior, provide a function to build a node specific for that case. Takes a node, the result of the first_node_parser .
)
	return function(state)
		local res
		local firstNode
		
		res = chain{
			first_node_parser,
			idop(
				function(state)
					firstNode = state.node 
				end
			),
			star(
				next_nodes_parser
			)
		}(state)
		
		if res then 
			if res.acc and #(res.acc) > 0 then 
				local acc = res.acc
				table.insert(acc, 1, firstNode)
				res.node = acc_result_node_creator(acc)
			else
				single_creator = single_creator or function(node) return node end
				res.node = single_creator(firstNode)
			end
			
			res.acc = nil
			return res
		end
	end
end

parser.nary_op_parser = nary_op_parser

return parser