--- HTML parsing module for LuaXML
-- @module luaxml-mod-html
-- @author Michal Hoftich <michal.h21@gmail.com>
-- Copyright Michal Hoftich, 2022
-- HTML parser inspired by https://browser.engineering/html.html
-- but then redone using https://html.spec.whatwg.org/multipage/parsing.html
--
-- The main purpose of this module is to create a useful DOM for later processing
-- using LuaXML functions, either for cleanup or for translation to output formats,
-- for example LaTeX.
--
-- It should be possible to serialize the DOM back to the original HTML code.
--
-- We attempt to do some basic fixes, like closing paragraphs or list items that
-- aren't closed correctly in the original code. We don't fix tables or
-- formatting elements (see https://html.spec.whatwg.org/multipage/parsing.html#the-list-of-active-formatting-elements)
-- as these features don't seem necessary for the purpose of this module. We may change
-- this policy in the future, if it turns out that they are necessary.
--
--
local M = {}

-- use local copies of utf8 functions
local ucodepoint = utf8.codepoint
local utfchar = utf8.char

--- Convert a codepoint to a UTF-8 string.
-- @param codepoint number or nil
-- @return the encoded character, or "" when the codepoint is nil or is not
-- greater than -1 (this covers the EOF marker used by the tokenizer)
local function uchar(codepoint)
  if not codepoint then return "" end
  if not (codepoint > -1) then return "" end
  return utfchar(codepoint)
end

-- declare namespaces
local xmlns = {
  HTML   = "http://www.w3.org/1999/xhtml",
  MathML = "http://www.w3.org/1998/Math/MathML",
  SVG    = "http://www.w3.org/2000/svg",
  XLink  = "http://www.w3.org/1999/xlink",
  XML    = "http://www.w3.org/XML/1998/namespace",
  XMLNS  = "http://www.w3.org/2000/xmlns/",
}

-- we must make a search tree for named entities, as their support
-- is quite messy
-- the module name depends on whether the global `kpse` is available
local named_entities = require(kpse and "luaxml-namedentities" or "luaxml.namedentities")

-- root of the search tree; each level is keyed by one character of the entity name
local entity_tree = {children = {}}

--- Descend one level in the entity search tree, creating the node if needed.
-- @param tree current tree node
-- @param char key of the child node
-- @return the child node stored under `char`
local function update_tree(tree, char)
  if not tree.children then
    tree.children = {}
  end
  local branches = tree.children
  local node = branches[char]
  if not node then
    node = {}
    branches[char] = node
  end
  return node
end

-- loop over named entities and update tree
-- Every entity name is inserted character by character; the node reached by
-- the last character stores the entity name and its replacement text.
for entity, replacement in pairs(named_entities) do
  local tree = entity_tree
  for char in entity:gmatch(".") do
    tree = update_tree(tree, char)
  end
  tree.entity = entity
  tree.char = replacement
end

--- Look up a sequence of characters in the named entity search tree.
-- @param tbl array of single characters (without the leading "&")
-- @return the tree node for the sequence, or nil when no entity name starts
-- with these characters. The node has the `char` field only when the
-- sequence is a complete entity name.
local function search_entity_tree(tbl)
  -- get named entity for the list of characters
  local tree = entity_tree
  for _, char in ipairs(tbl) do
    if tree.children then
      tree = tree.children[char]
      if not tree then return nil end
    else
      return nil
    end
  end
  -- print("tree", tree.char)
  return tree
end

-- declare basic node types

--- Root node of the document tree.
local Root = {
  _type = "root",
  xmlns = xmlns.HTML
}

--- Create a new root node with an empty list of children.
-- The prototype table is (re)configured as metatable on every call.
function Root:init()
  local o = {}
  setmetatable(o, self)
  self.__index = self
  self.__tostring = function (x) return "_ROOT" end
  o.children = {}
  return o
end

--- Append a child node. This method is shared by all node types.
function Root:add_child(node)
  table.insert(self.children, node)
end

--- Doctype node.
local Doctype = {
  _type = "doctype"
}

--- Create a new doctype node.
-- @param name doctype name
-- @param parent parent node
function Doctype:init(name, parent)
  local o = {}
  setmetatable(o, self)
  self.__index = self
  self.__tostring = function (x)
    if x.data then
      return "<!DOCTYPE " .. x.name .. " " .. x.data .. ">"
    else
      return "<!DOCTYPE " .. x.name .. ">"
    end
  end
  self.add_child = Root.add_child
  o.parent = parent
  o.name = name
  o.children = {}
  return o
end

--- Store the text that followed the doctype name.
function Doctype:add_data(data)
  self.data = data
end

--- Text node.
local Text = {
  _type = "text"
}

--- Create a new text node.
-- @param text node content
-- @param parent parent node
function Text:init(text, parent)
  local o = {}
  setmetatable(o, self)
  self.__index = self
  o.text = text
  self.__tostring = function (x) return "'" .. x.text .. "'" end
  self.add_child = Root.add_child
  o.parent = parent
  o.children = {}
  return o
end

--- Comment node.
local Comment = {
  _type = "comment"
}

--- Create a new comment node.
-- @param text comment content
-- @param parent parent node
function Comment:init(text, parent)
  local o = {}
  setmetatable(o, self)
  self.__index = self
  o.text = text
  self.__tostring = function (x) return "<!--" .. x.text .. "-->" end
  self.add_child = Root.add_child
  o.parent = parent
  o.children = {}
  return o
end

--- Element node.
local Element = {
  _type = "element"
}

--- Create a new element node in the HTML namespace.
-- @param tag tag name, either a string or a table of characters
-- @param parent parent node
function Element:init(tag, parent)
  local o = {}
  setmetatable(o, self)
  self.__index = self
  -- tag can be table with unicode characters
  if type(tag) == "table" then
    o.tag = table.concat(tag)
  else
    o.tag = tag
  end
  self.__tostring = function(x)
    local attr = {}
    for _, el in ipairs(x.attr) do
      -- handle attributes
      local value
      -- values that contain a double quote are wrapped in apostrophes
      if el.value:match('"') then
        value = "'" .. el.value .. "'"
      else
        value = '"' .. el.value .. '"'
      end
      attr[#attr+1] = el.name .. "=" .. value
    end
    local closing = ">"
    if x.self_closing then
      closing = " />"
    end
    if #attr > 0 then
      return "<" .. x.tag .. " " .. table.concat(attr, " ") .. closing
    else
      return "<" .. x.tag .. closing
    end
  end
  self.add_child = Root.add_child
  o.children = {}
  o.attr = {}
  o.parent = parent
  -- default xmlns
  o.xmlns = xmlns.HTML
  return o
end

-- state machine functions
-- each function takes HtmlParser as an argument
-- and returns the name of the next state
local HtmlStates = {}

-- declare codepoints for more efficient processing
local less_than    = ucodepoint("<")
local greater_than = ucodepoint(">")
local amperesand   = ucodepoint("&")
local exclam       = ucodepoint("!")
local question     = ucodepoint("?")
local solidus      = ucodepoint("/")
local equals       = ucodepoint("=")
local quoting      = ucodepoint('"')
local apostrophe   = ucodepoint("'")
local semicolon    = ucodepoint(";")
local hyphen       = ucodepoint("-")
local dash         = ucodepoint("-")
local numbersign   = ucodepoint("#")
local smallx       = ucodepoint("x")
local bigx         = ucodepoint("X")
local right_square = ucodepoint("]")
local EOF          = -1 -- special character, meaning end of stream
local null         = 0

--- True for codepoints of ASCII upper case letters (A-Z), nil otherwise.
local function is_upper_alpha(codepoint)
  if (64 < codepoint and codepoint < 91) then
    return true
  end
end

--- True for codepoints of ASCII lower case letters (a-z), nil otherwise.
local function is_lower_alpha(codepoint)
  if (96 < codepoint and codepoint < 123) then
    return true
  end
end

--- Detect whether the codepoint is an ASCII letter.
local function is_alpha(codepoint)
  if is_upper_alpha(codepoint) or is_lower_alpha(codepoint) then
    return true
  end
  return false
end

--- True for codepoints of ASCII digits (0-9), nil otherwise.
local function is_numeric(codepoint)
  if 47 < codepoint and codepoint < 58 then
    return true
  end
end

--- True for codepoints of upper case hexadecimal letters (A-F), nil otherwise.
local function is_upper_hex(codepoint)
  if 64 < codepoint and codepoint < 71 then
    return true
  end
end

--- True for codepoints of lower case hexadecimal letters (a-f), nil otherwise.
local function is_lower_hex(codepoint)
  if 96 < codepoint and codepoint < 103 then
    return true
  end
end

--- True for codepoints of hexadecimal digits (0-9, a-f, A-F), nil otherwise.
local function is_hexadecimal(codepoint)
  if is_numeric(codepoint) or
     is_lower_hex(codepoint) or
     is_upper_hex(codepoint)
  then
    return true
  end
end

--- Detect whether the codepoint is an ASCII letter or digit.
local function is_alphanumeric(codepoint)
  return is_alpha(codepoint) or is_numeric(codepoint)
end

--- Detect space characters (tab, line feed, form feed and space).
local function is_space(codepoint)
  if codepoint == 0x0009 or codepoint == 0x000A or codepoint == 0x000C or codepoint == 0x0020 then
    return true
  end
  return false
end

--- Detect codepoints from the UTF-16 surrogate range.
local function is_surrogate(codepoint)
  return 0xD800 <= codepoint and codepoint <= 0xDFFF
end

-- replacements for numeric character references in the 0x80-0x9F range
-- (declared local, so it doesn't leak into the global environment)
local character_entity_replace_table = {
  [0x80] = 0x20AC,
  [0x82] = 0x201A,
  [0x83] = 0x0192,
  [0x84] = 0x201E,
  [0x85] = 0x2026,
  [0x86] = 0x2020,
  [0x87] = 0x2021,
  [0x88] = 0x02C6,
  [0x89] = 0x2030,
  [0x8A] = 0x0160,
  [0x8B] = 0x2039,
  [0x8C] = 0x0152,
  [0x8E] = 0x017D,
  [0x91] = 0x2018,
  [0x92] = 0x2019,
  [0x93] = 0x201C,
  [0x94] = 0x201D,
  [0x95] = 0x2022,
  [0x96] = 0x2013,
  [0x97] = 0x2014,
  [0x98] = 0x02DC,
  [0x99] = 0x2122,
  [0x9A] = 0x0161,
  [0x9B] = 0x203A,
  [0x9C] = 0x0153,
  [0x9E] = 0x017E,
  [0x9F] = 0x0178
}

--- Replace the NULL codepoint with U+FFFD, pass everything else through.
local function fix_null(codepoint)
  if codepoint == null then
    return 0xFFFD
  else
    return codepoint
  end
end

HtmlStates.data = function(parser)
  -- this is the default state
  local codepoint = parser.codepoint
  -- print("codepoint", parser.codepoint)
  if codepoint == less_than then
    -- start of tag
    return "tag_open"
  elseif codepoint == amperesand then
    -- we must save the current state
    -- what we will return to after entity
    parser.return_state = "data"
    return "character_reference"
  elseif codepoint == EOF then
    parser:emit_eof()
  else
    parser:emit_character(uchar(codepoint))
  end
  return "data"
end

HtmlStates.tag_open = function(parser)
  -- parse tag contents
  local codepoint = parser.codepoint
  if codepoint == exclam then
    return "markup_declaration_open"
  elseif codepoint == solidus then
    return "end_tag_open"
  elseif codepoint == question then
    parser:start_token("comment",{data={}})
    return "bogus_comment"
  elseif is_alpha(codepoint) then
    local data = {
      name = {},
      attr = {},
      current_attr_name = {},
      current_attr_value = {},
      self_closing = false
    }
    parser:start_token("start_tag", data)
    return parser:tokenize("tag_name")
  elseif codepoint == EOF then
    -- the "<" that opened this state was not emitted yet, so it is emitted
    -- as text before the end of file
    parser:emit_character("<")
    parser:emit_eof()
  else
    -- invalid tag
    -- emit "<" and reconsume current character as data
    parser:emit_character("<")
    return parser:tokenize("data")
  end
end

HtmlStates.character_reference = function(parser)
  -- parse HTML entities
  -- initialize temp buffer
  parser.temp_buffer = {"&"}
  local codepoint = parser.codepoint
  if is_alphanumeric(codepoint) then
    return parser:tokenize("named_character_reference")
  elseif codepoint == numbersign then
    table.insert(parser.temp_buffer, uchar(codepoint))
    return "numeric_character_reference"
  else
    -- not an entity, the "&" is flushed and the current character reconsumed
    parser:flush_temp_buffer()
    return parser:tokenize(parser.return_state)
  end
end

HtmlStates.named_character_reference = function(parser)
  -- named entity parsing is pretty complicated
  -- https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
  local codepoint = parser.codepoint
  -- test if the current entity name is included in the named entity list
  local search_table = {}
  -- first char in temp buffer is &, which we don't want to lookup in the search tree
  for i = 2, #parser.temp_buffer do
    search_table[#search_table+1] = parser.temp_buffer[i]
  end
  if codepoint == semicolon then
    -- close named entity
    local entity = search_entity_tree(search_table)
    if entity and entity.char then
      parser:add_entity(entity.char)
    else
      -- if the current name doesn't correspond to any named entity, flush everything into text
      parser:flush_temp_buffer()
      return parser:tokenize(parser.return_state)
    end
    return parser.return_state
  else
    local char = uchar(codepoint)
    -- try if the current entity name is in the named entity search tree
    table.insert(search_table, char)
    local entity = search_entity_tree(search_table)
    if entity then
      -- keep parsing name entity while we match a name
      table.insert(parser.temp_buffer, char)
      return "named_character_reference"
    else
      -- here this will be more complicated
      if #search_table > 1 then
        local token = parser.current_token
        if token.type == "start_tag" and (codepoint == equals or is_alphanumeric(codepoint)) then
          -- in attribute value, flush characters and retokenize
          parser:flush_temp_buffer()
          return parser:tokenize(parser.return_state)
        else
          -- try to get entity for characters preceding the current character
          table.remove(search_table)
          local newentity = search_entity_tree(search_table)
          if newentity and newentity.char then
            parser:add_entity(newentity.char)
          else
            -- we need to find if parts of the current substring match a named entity
            -- for example "&notit;" -> "¬it;" but "&notin;" -> "∉"
            local rest = {}
            -- loop over the table with characters, and try to find if it matches entity
            for i = #search_table, 1, -1 do
              local removed_char = table.remove(search_table)
              -- removed characters are collected in their original order
              table.insert(rest, 1, removed_char)
              newentity = search_entity_tree(search_table)
              if newentity and newentity.char then
                parser:add_entity(newentity.char)
                parser.temp_buffer = rest
                break
              end
            end
            -- replace temporary buffer with characters that followed the matched entity
            parser:flush_temp_buffer()
          end
          return parser:tokenize(parser.return_state)
        end
      else
        -- search table contains only the current character
        parser:flush_temp_buffer()
        return parser:tokenize(parser.return_state)
      end
    end
  end
end

HtmlStates.numeric_character_reference = function(parser)
  -- this variable will hold the number
  local codepoint = parser.codepoint
  parser.character_reference_code = 0
  if codepoint == smallx or codepoint == bigx then
    -- hexadecimal entity
    table.insert(parser.temp_buffer, uchar(codepoint))
    return "hexadecimal_character_reference_start"
  else
    -- try decimal entity
    return parser:tokenize("decimal_character_reference_start")
  end
end

HtmlStates.hexadecimal_character_reference_start = function(parser)
  local codepoint = parser.codepoint
  if is_hexadecimal(codepoint) then
    return parser:tokenize("hexadecimal_character_reference")
  else
    -- no digit after "&#x", flush the buffer as text
    parser:flush_temp_buffer()
    return parser:tokenize(parser.return_state)
  end
end

HtmlStates.decimal_character_reference_start = function(parser)
  local codepoint = parser.codepoint
  if is_numeric(codepoint) then
    return parser:tokenize("decimal_character_reference")
  else
    -- no digit after "&#", flush the buffer as text
    parser:flush_temp_buffer()
    return parser:tokenize(parser.return_state)
  end
end

HtmlStates.decimal_character_reference = function(parser)
  local codepoint = parser.codepoint
  -- helper functions for easier working with the character_reference_code
  local function multiply(number)
    parser.character_reference_code = parser.character_reference_code * number
  end
  local function add(number)
    parser.character_reference_code = parser.character_reference_code + number
  end
  if is_numeric(codepoint) then
    multiply(10)
    add(codepoint - 0x30)
  elseif codepoint == semicolon then
    return "numeric_reference_end_state"
  else
    -- this adds current entity
    parser:tokenize("numeric_reference_end_state")
    -- now tokenize the current character
    return parser:tokenize(parser.return_state)
  end
  return "decimal_character_reference"
end

HtmlStates.hexadecimal_character_reference = function(parser)
  local codepoint = parser.codepoint
  -- helper functions for easier working with the character_reference_code
  local function multiply(number)
    parser.character_reference_code = parser.character_reference_code * number
  end
  local function add(number)
    parser.character_reference_code = parser.character_reference_code + number
  end
  if is_numeric(codepoint) then
    multiply(16)
    add(codepoint - 0x30)
  elseif is_upper_hex(codepoint) then
    multiply(16)
    add(codepoint - 0x37)
  elseif is_lower_hex(codepoint) then
    multiply(16)
    add(codepoint - 0x57)
  elseif codepoint == semicolon then
    return "numeric_reference_end_state"
  else
    -- this adds current entity
    parser:tokenize("numeric_reference_end_state")
    -- now tokenize the current character
    return parser:tokenize(parser.return_state)
  end
  return "hexadecimal_character_reference"
end

HtmlStates.numeric_reference_end_state = function(parser)
  -- this state doesn't look at the current codepoint, it only converts
  -- the collected character code
  local character = parser.character_reference_code
  -- we need to clean invalid character codes
  if character == 0x00 or
     character > 0x10FFFF or
     is_surrogate(character)
  then
    character = 0xFFFD
  -- should we add special support for "noncharacter"? I think we can pass them to the output anyway
  elseif character_entity_replace_table[character] then
    character = character_entity_replace_table[character]
  end
  parser:add_entity(uchar(character))
  return parser.return_state
end

HtmlStates.markup_declaration_open = function(parser)
  -- started by <!
  -- we now need to find the following text, to find if we started comment, doctype, or cdata
  local comment_pattern = "^%-%-"
  local doctype_pattern = "^[Dd][Oo][Cc][Tt][Yy][Pp][Ee]"
  local cdata_pattern = "^%[CDATA%["
  local start_pos = parser.position
  local text = parser.body
  if text:match(comment_pattern, start_pos) then
    -- local _, newpos = text:find(comment_pattern, start_pos)
    -- we need to ignore next few characters
    parser.ignored_pos = start_pos + 1
    parser:start_token("comment", {data = {}})
    return "comment_start"
  elseif text:match(doctype_pattern, start_pos) then
    parser.ignored_pos = start_pos + 6
    parser:start_token("doctype", {name = {}, data = {}, force_quirks = false})
    return "doctype"
  elseif text:match(cdata_pattern, start_pos) then
    parser.ignored_pos = start_pos + 6
    local current_element = parser:current_node()
    if current_element.xmlns == xmlns.HTML or not current_element.xmlns then
      -- we change CDATA simply to comments
      parser:start_token("comment", {data = {"[CDATA["}})
      return "bogus_comment"
    else
      -- we are in XML mode, this happens for included SVG or MathML
      return "cdata_section"
    end
  else
    parser:start_token("comment", {data = {}})
    return "bogus_comment"
  end
  -- local start, stop = string.find(parser.body, comment_pattern, parser.position)
end

HtmlStates.cdata_section = function(parser)
  local codepoint = parser.codepoint
  if codepoint == right_square then
    return "cdata_section_bracket"
  elseif codepoint == EOF then
    parser:emit_eof()
  else
    parser:emit_character(uchar(codepoint))
    return "cdata_section"
  end
end

HtmlStates.cdata_section_bracket = function(parser)
  -- one "]" has been read and not emitted yet
  local codepoint = parser.codepoint
  if codepoint == right_square then
    return "cdata_section_end"
  else
    parser:emit_character("]")
    return parser:tokenize("cdata_section")
  end
end

HtmlStates.cdata_section_end = function(parser)
  -- two "]" characters have been read and not emitted yet
  local codepoint = parser.codepoint
  if codepoint == right_square then
    -- emit the oldest bracket, two brackets are still pending
    parser:emit_character("]")
    return "cdata_section_end"
  elseif codepoint == greater_than then
    return "data"
  else
    -- it wasn't the "]]>" terminator, so both pending brackets are text
    parser:emit_character("]]")
    return parser:tokenize("cdata_section")
  end
end

HtmlStates.comment_start = function(parser)
  local codepoint = parser.codepoint
  if codepoint == hyphen then
    return "comment_start_dash"
  elseif codepoint == greater_than then
    parser:emit()
    return "data"
  else
    return parser:tokenize("comment")
  end
end

HtmlStates.comment_start_dash = function(parser)
  local codepoint = parser.codepoint
  if codepoint == hyphen then
    return "comment_end"
  elseif codepoint == greater_than then
    parser:emit()
    -- the name of the next state must be returned as a string
    return "data"
  elseif codepoint == EOF then
    parser:emit()
    parser:emit_eof()
  else
    parser:append_token_data("data", "-")
    return parser:tokenize("comment")
  end
end

HtmlStates.comment = function(parser)
  local codepoint = parser.codepoint
  codepoint = fix_null(codepoint)
  if codepoint == less_than then
    parser:append_token_data("data", uchar(codepoint))
    return "comment_less_than"
  elseif codepoint == hyphen then
    return "comment_end_dash"
  elseif codepoint == EOF then
    parser:emit()
    parser:emit_eof()
  else
    parser:append_token_data("data", uchar(codepoint))
  end
  return "comment"
end
HtmlStates.comment_less_than = function(parser)
  -- "<" inside of a comment, it was already appended to the comment data
  local codepoint = parser.codepoint
  if codepoint == exclam then
    parser:append_token_data("data", uchar(codepoint))
    return "comment_less_than_bang"
  elseif codepoint == less_than then
    parser:append_token_data("data", uchar(codepoint))
    return "comment_less_than"
  else
    return parser:tokenize("comment")
  end
end

HtmlStates.comment_less_than_bang = function(parser)
  local codepoint = parser.codepoint
  if codepoint == hyphen then
    return "comment_less_than_bang_dash"
  else
    return parser:tokenize("comment")
  end
end

HtmlStates.comment_less_than_bang_dash = function(parser)
  local codepoint = parser.codepoint
  if codepoint == hyphen then
    return "comment_less_than_bang_dash_dash"
  else
    return parser:tokenize("comment_end_dash")
  end
end

HtmlStates.comment_less_than_bang_dash_dash = function(parser)
  -- these comment states start to be ridiculous
  -- every character is reconsumed in the comment_end state (nested comments
  -- are an error, but we don't report errors)
  return parser:tokenize("comment_end")
end

HtmlStates.comment_end_dash = function(parser)
  -- one "-" has been read and not appended to the comment yet
  local codepoint = parser.codepoint
  if codepoint == hyphen then
    return "comment_end"
  elseif codepoint == EOF then
    parser:emit()
    parser:emit_eof()
  else
    -- the pending dash is a part of the comment text. the current character
    -- is reconsumed by the comment state, so it must not be appended here
    parser:append_token_data("data", "-")
    return parser:tokenize("comment")
  end
end

HtmlStates.comment_end = function(parser)
  -- two "-" characters have been read and not appended to the comment yet
  local codepoint = parser.codepoint
  if codepoint == greater_than then
    parser:emit()
    return "data"
  elseif codepoint == exclam then
    return "comment_end_bang"
  elseif codepoint == hyphen then
    parser:append_token_data("data", "-")
    return "comment_end"
  elseif codepoint == EOF then
    parser:emit()
    parser:emit_eof()
  else
    parser:append_token_data("data", "--")
    return parser:tokenize("comment")
  end
end

HtmlStates.comment_end_bang = function(parser)
  -- "--!" has been read and not appended to the comment yet
  local codepoint = parser.codepoint
  if codepoint == hyphen then
    parser:append_token_data("data", "--!")
    return "comment_end_dash"
  elseif codepoint == greater_than then
    parser:emit()
    return "data"
  elseif codepoint == EOF then
    parser:emit()
    parser:emit_eof()
  else
    parser:append_token_data("data", "--!")
    return parser:tokenize("comment")
  end
end

HtmlStates.end_tag_open = function(parser)
  -- started by </
  local codepoint = parser.codepoint
  if is_alpha(codepoint) then
    local data = {
      name = {}
    }
    parser:start_token("end_tag", data)
    return parser:tokenize("tag_name")
  elseif codepoint == greater_than then
    -- "</>" is ignored
    return "data"
  elseif codepoint == EOF then
    parser:discard_token()
    parser:emit_character("</")
    parser:emit_eof()
  else
    -- local variable, so we don't create a global variable by accident
    local data = {
      data = {}
    }
    parser:start_token("comment", data)
    return parser:tokenize("bogus_comment")
  end
end

HtmlStates.bogus_comment = function(parser)
  -- started by <?
  local codepoint = parser.codepoint
  codepoint = fix_null(codepoint)
  if codepoint == greater_than then
    parser:emit()
    return "data"
  elseif codepoint == EOF then
    parser:emit()
    parser:emit_eof()
  else
    parser:append_token_data("data", uchar(codepoint))
    return "bogus_comment"
  end
end

--- Handle end of file inside of a doctype: mark the token as quirky, emit it
-- and emit the end of file.
local function doctype_eof(parser)
  parser:set_token_data("force_quirks", true)
  parser:emit()
  parser:emit_eof()
end

HtmlStates.doctype = function(parser)
  local codepoint = parser.codepoint
  if is_space(codepoint) then
    return "before_doctype_name"
  elseif codepoint == greater_than then
    return parser:tokenize("before_doctype_name")
  elseif codepoint == EOF then
    doctype_eof(parser)
  else
    return parser:tokenize("before_doctype_name")
  end
end

HtmlStates.before_doctype_name = function(parser)
  local codepoint = parser.codepoint
  codepoint = fix_null(codepoint)
  if is_space(codepoint) then
    return "before_doctype_name"
  elseif codepoint == greater_than then
    parser:set_token_data("force_quirks", true)
    parser:emit()
    return "data"
  elseif codepoint == EOF then
    doctype_eof(parser)
  elseif is_upper_alpha(codepoint) then
    -- add lowercase name
    parser:append_token_data("name", uchar(codepoint + 0x20))
    return "doctype_name"
  else
    parser:append_token_data("name", uchar(codepoint))
    return "doctype_name"
  end
end

HtmlStates.doctype_name = function(parser)
  local codepoint = parser.codepoint
  codepoint = fix_null(codepoint)
  if is_space(codepoint) then
    return "after_doctype_name"
  elseif codepoint == greater_than then
    parser:emit()
    return "data"
  elseif codepoint == EOF then
    doctype_eof(parser)
  elseif is_upper_alpha(codepoint) then
    -- add lowercase name
    parser:append_token_data("name", uchar(codepoint + 0x20))
    return "doctype_name"
  else
    parser:append_token_data("name", uchar(codepoint))
    return "doctype_name"
  end
end

HtmlStates.after_doctype_name = function(parser)
  local codepoint = parser.codepoint
  if is_space(codepoint) then
    return "after_doctype_name"
  elseif codepoint == greater_than then
    parser:emit()
    return "data"
  elseif codepoint == EOF then
    doctype_eof(parser)
  else
    parser:append_token_data("data", uchar(codepoint))
    -- there are lot of complicated rules how to consume doctype,
    -- but I think that for our purpose they aren't interesting.
    -- so everything until EOF or > is consumed as token.data
    return "consume_doctype_data"
  end
end

HtmlStates.consume_doctype_data = function(parser)
  -- this state just reads everything inside doctype as data
  local codepoint = parser.codepoint
  if codepoint == greater_than then
    parser:emit()
    return "data"
  elseif codepoint == EOF then
    doctype_eof(parser)
  else
    parser:append_token_data("data", uchar(codepoint))
    return "consume_doctype_data"
  end
end

HtmlStates.tag_name = function(parser)
  local codepoint = parser.codepoint
  codepoint = fix_null(codepoint)
  if is_space(codepoint) then
    return "before_attribute_name"
  elseif codepoint == solidus then
    return "self_closing_tag"
  elseif codepoint == greater_than then
    parser:emit()
    return "data"
  elseif is_upper_alpha(codepoint) then
    -- tag names are lowercased
    local lower = string.lower(uchar(codepoint))
    parser:append_token_data("name", lower)
  elseif codepoint == EOF then
    parser:emit()
    parser:emit_eof()
  else
    local char = uchar(codepoint)
    parser:append_token_data("name", char)
  end
  return "tag_name"
end

HtmlStates.self_closing_tag = function(parser)
  local codepoint = parser.codepoint
  if codepoint == greater_than then
    parser.current_token.self_closing = true
    parser:emit()
    return "data"
  else
    return parser:tokenize("before_attribute_name")
  end
end

HtmlStates.before_attribute_name = function(parser)
  local codepoint = parser.codepoint
  if is_space(codepoint) then
    -- ignore spacing
    return "before_attribute_name"
  elseif codepoint == solidus or codepoint == greater_than then
    -- reconsume in after_attribute_name
    return parser:tokenize("after_attribute_name")
  elseif codepoint == equals then
    -- ToDo: handle https://html.spec.whatwg.org/multipage/parsing.html#parse-error-unexpected-equals-sign-before-attribute-name
    -- (no state name is returned in this case)
  else
    -- start new attribute
    parser:start_attribute()
    return parser:tokenize("attribute_name")
  end
end

HtmlStates.attribute_name = function(parser)
  local codepoint = parser.codepoint
  if is_space(codepoint)
     or codepoint == solidus
     or codepoint == greater_than
  then
    return parser:tokenize("after_attribute_name")
  elseif codepoint == equals then
    return "before_attribute_value"
  elseif is_upper_alpha(codepoint) then
    -- lowercase attribute names
    local lower = string.lower(uchar(codepoint))
    parser:append_token_data("current_attr_name", lower)
    return "attribute_name"
  else
    parser:append_token_data("current_attr_name", uchar(codepoint))
    return "attribute_name"
  end
end

HtmlStates.after_attribute_name = function(parser)
  local codepoint = parser.codepoint
  if is_space(codepoint) then
    return "after_attribute_name"
  elseif codepoint == equals then
    return "before_attribute_value"
  elseif codepoint == solidus then
    return "self_closing_tag"
  elseif codepoint == greater_than then
    parser:emit()
    return "data"
  else
    parser:start_attribute()
    return parser:tokenize("attribute_name")
  end
end

HtmlStates.before_attribute_value = function(parser)
  local codepoint = parser.codepoint
  if is_space(codepoint) then
    return "before_attribute_value"
  elseif codepoint == quoting then
    return "attribute_value_quoting"
  elseif codepoint == apostrophe then
    return "attribute_value_apostrophe"
  elseif codepoint == greater_than then
    parser:emit()
    return "data"
  else
    return parser:tokenize("attribute_value_unquoted")
  end
end

HtmlStates.attribute_value_quoting = function(parser)
  -- attribute value in double quotes
  local codepoint = parser.codepoint
  if codepoint == quoting then
    return "after_attribute_value_quoting"
  elseif codepoint == amperesand then
    parser.return_state = "attribute_value_quoting"
    return "character_reference"
  else
    parser:append_token_data("current_attr_value", uchar(codepoint))
    return "attribute_value_quoting"
  end
end

HtmlStates.attribute_value_apostrophe = function(parser)
  -- attribute value in apostrophes
  local codepoint = parser.codepoint
  if codepoint == apostrophe then
    return "after_attribute_value_quoting"
  elseif codepoint == amperesand then
    parser.return_state = "attribute_value_apostrophe"
    return "character_reference"
  else
    parser:append_token_data("current_attr_value", uchar(codepoint))
    return "attribute_value_apostrophe"
  end
end

HtmlStates.attribute_value_unquoted = function(parser)
  local codepoint = parser.codepoint
  if is_space(codepoint) then
    return "before_attribute_name"
  elseif codepoint == amperesand then
    parser.return_state = "attribute_value_unquoted"
    return "character_reference"
  elseif codepoint == greater_than then
    parser:emit()
    return "data"
  else
    parser:append_token_data("current_attr_value", uchar(codepoint))
    return "attribute_value_unquoted"
  end
end

HtmlStates.after_attribute_value_quoting = function(parser)
  local codepoint = parser.codepoint
  if is_space(codepoint) then
    return "before_attribute_name"
  elseif codepoint == solidus then
    return "self_closing_tag"
  elseif codepoint == greater_than then
    parser:emit()
    return "data"
  else
    return parser:tokenize("before_attribute_name")
  end
end

HtmlStates.rcdata = function(parser)
  -- same as the data state, but "<" starts only the end tag detection
  local codepoint = parser.codepoint
  -- print("codepoint", parser.codepoint)
  codepoint = fix_null(codepoint)
  if codepoint == less_than then
    -- start of tag
    return "rcdata_less_than"
  elseif codepoint == amperesand then
    -- we must save the current state
    -- what we will return to after entity
    parser.return_state = "rcdata"
    return "character_reference"
  elseif codepoint == EOF then
    parser:emit_eof()
  else
    parser:emit_character(uchar(codepoint))
  end
  return "rcdata"
end

--- Discard the current token and emit the given text as characters instead.
local function discard_rcdata_end_tag(parser, text)
  parser:discard_token()
  parser:emit_character(text)
end

--- Shared code for the rcdata, rawtext and script_data end tag name states.
-- The end tag is accepted only when its name is the same as the tag of the
-- node returned by parser:get_parent(). In all other cases the characters
-- collected so far are emitted as text and the current character is
-- reconsumed in text_state.
-- @param parser HtmlParser object
-- @param name_state name of the calling state, returned while we read the tag name
-- @param text_state name of the state to return to when it isn't the end tag
-- @return name of the next state
local function text_end_tag_name(parser, name_state, text_state)
  -- we need to find name of the currently opened tag
  local parent = parser:get_parent() or {}
  local opened_tag = parent.tag
  local current_tag = table.concat(parser.current_token.name or {})
  local codepoint = parser.codepoint
  if is_upper_alpha(codepoint) then
    -- the token name is lowercased, temp buffer keeps the original character
    parser:append_token_data("name", uchar(codepoint + 0x20))
    table.insert(parser.temp_buffer, uchar(codepoint))
    return name_state
  elseif is_lower_alpha(codepoint) then
    parser:append_token_data("name", uchar(codepoint))
    table.insert(parser.temp_buffer, uchar(codepoint))
    return name_state
  elseif opened_tag == current_tag then
    if is_space(codepoint) then
      return "before_attribute_name"
    elseif codepoint == solidus then
      return "self_closing_tag"
    elseif codepoint == greater_than then
      parser:emit()
      return "data"
    end
    -- any other character after the matching name means that it isn't
    -- the end tag, so we continue with the code below
  end
  discard_rcdata_end_tag(parser, "</" .. table.concat(parser.temp_buffer))
  parser.temp_buffer = {}
  return parser:tokenize(text_state)
end

HtmlStates.rcdata_less_than = function(parser)
  local codepoint = parser.codepoint
  if codepoint == solidus then
    return "rcdata_end_tag_open"
  else
    discard_rcdata_end_tag(parser, "<")
    return parser:tokenize("rcdata")
  end
end

HtmlStates.rcdata_end_tag_open = function(parser)
  local codepoint = parser.codepoint
  if is_alpha(codepoint) then
    parser:start_token("end_tag", {name={}})
    parser.temp_buffer = {}
    return parser:tokenize("rcdata_end_tag_name")
  else
    discard_rcdata_end_tag(parser, "</")
    return parser:tokenize("rcdata")
  end
end

HtmlStates.rcdata_end_tag_name = function(parser)
  return text_end_tag_name(parser, "rcdata_end_tag_name", "rcdata")
end

HtmlStates.rawtext = function(parser)
  local codepoint = parser.codepoint
  codepoint = fix_null(codepoint)
  if codepoint == less_than then
    return "rawtext_less_than"
  elseif codepoint == EOF then
    parser:emit_eof()
  else
    parser:emit_character(uchar(codepoint))
    return "rawtext"
  end
end

HtmlStates.rawtext_less_than = function(parser)
  local codepoint = parser.codepoint
  if codepoint == solidus then
    return "rawtext_end_tag_open"
  else
    parser:emit_character("<")
    return parser:tokenize("rawtext")
  end
end

HtmlStates.rawtext_end_tag_open = function(parser)
  local codepoint = parser.codepoint
  if is_alpha(codepoint) then
    parser:start_token("end_tag", {name={}})
    parser.temp_buffer = {}
    return parser:tokenize("rawtext_end_tag_name")
  else
    parser:emit_character("</")
    return parser:tokenize("rawtext")
  end
end

HtmlStates.rawtext_end_tag_name = function(parser)
  return text_end_tag_name(parser, "rawtext_end_tag_name", "rawtext")
end

HtmlStates.script_data = function(parser)
  local codepoint = parser.codepoint
  codepoint = fix_null(codepoint)
  if codepoint == less_than then
    return "script_data_less_than"
  elseif codepoint == EOF then
    parser:emit_eof()
  else
    parser:emit_character(uchar(codepoint))
    return "script_data"
  end
end

HtmlStates.script_data_less_than = function(parser)
  local codepoint = parser.codepoint
  if codepoint == solidus then
    parser.temp_buffer = {}
    return "script_data_end_tag_open"
  elseif codepoint == exclam then
    parser:emit_character("<!")
    return "script_data_escape_start"
  else
    parser:emit_character("<")
    return parser:tokenize("script_data")
  end
end

HtmlStates.script_data_end_tag_open = function(parser)
  local codepoint = parser.codepoint
  if is_alpha(codepoint) then
    parser:start_token("end_tag", {name={}})
    return parser:tokenize("script_data_end_tag_name")
  else
    parser:emit_character("</")
    return parser:tokenize("script_data")
  end
end

HtmlStates.script_data_end_tag_name = function(parser)
  return text_end_tag_name(parser, "script_data_end_tag_name", "script_data")
end

HtmlStates.script_data_escape_start = function(parser)
  -- started by <!
  local codepoint = parser.codepoint
  if codepoint == hyphen then
    parser:emit_character("-")
    return "script_data_escape_start_dash"
  else
    -- we must return the state selected by the reconsumed character
    return parser:tokenize("script_data")
  end
end

HtmlStates.script_data_escape_start_dash = function(parser)
  -- started by <!-
  local codepoint = parser.codepoint
  if codepoint == hyphen then
    parser:emit_character("-")
    return "script_data_escaped_dash_dash"
  else
    -- we must return the state selected by the reconsumed character
    return parser:tokenize("script_data")
  end
end

HtmlStates.script_data_escaped = function(parser)
  local codepoint = parser.codepoint
  codepoint = fix_null(codepoint)
  if codepoint == hyphen then
    parser:emit_character("-")
    return "script_data_escaped_dash"
  elseif codepoint == less_than then
    return "script_data_escaped_less_than_sign"
  elseif codepoint == EOF then
    parser:emit_eof()
  else
    parser:emit_character(uchar(codepoint))
    return "script_data_escaped"
  end
end

HtmlStates.script_data_escaped_dash = function(parser)
  local codepoint = parser.codepoint
  codepoint = fix_null(codepoint)
  if codepoint == hyphen then
    parser:emit_character("-")
    return "script_data_escaped_dash_dash"
  elseif codepoint == less_than then
    return "script_data_escaped_less_than_sign"
  elseif codepoint == EOF then
    parser:emit_eof()
  else
    parser:emit_character(uchar(codepoint))
    return "script_data_escaped"
  end
end

HtmlStates.script_data_escaped_dash_dash = function(parser)
  local codepoint = parser.codepoint
  codepoint = fix_null(codepoint)
  if codepoint == hyphen then
    parser:emit_character("-")
    return "script_data_escaped_dash_dash"
  elseif codepoint == less_than then
    return "script_data_escaped_less_than_sign"
  elseif codepoint == greater_than then
    -- "-->" closes the escaped section
    parser:emit_character(">")
    return "script_data"
  elseif codepoint == EOF then
    parser:emit_eof()
  else
    parser:emit_character(uchar(codepoint))
    return "script_data_escaped"
  end
end

HtmlStates.script_data_escaped_less_than_sign = function(parser) local codepoint 
= parser.codepoint if codepoint == solidus then parser.temp_buffer = {} return "script_data_escaped_end_tag_open" elseif is_alpha(codepoint) then parser.temp_buffer = {} parser:emit_character("<") return parser:tokenize("script_data_double_escape_start") else parser:emit_character("<") return parser:tokenize("script_data_escaped") end end HtmlStates.script_data_escaped_end_tag_open = function(parser) local codepoint = parser.codepoint if is_alpha(codepoint) then parser:start_token("end_tag", {name={}}) return parser:tokenize("script_data_escaped_end_tag_name") else parser:emit_character("</") return parser:tokenize("script_data_escaped") end end HtmlStates.script_data_escaped_end_tag_name = function(parser) -- we need to find name of the currently opened tag local parent = parser:get_parent() or {} local opened_tag = parent.tag local current_tag = table.concat(parser.current_token.name or {}) local codepoint = parser.codepoint if is_upper_alpha(codepoint) then parser:append_token_data("name", uchar(codepoint + 0x20)) table.insert(parser.temp_buffer, uchar(codepoint)) return "script_data_escaped_end_tag_name" elseif is_lower_alpha(codepoint) then parser:append_token_data("name", uchar(codepoint)) table.insert(parser.temp_buffer, uchar(codepoint)) return "script_data_escaped_end_tag_name" elseif opened_tag == current_tag then if is_space(codepoint) then return "before_attribute_name" elseif codepoint == solidus then return "self_closing_tag" elseif codepoint == greater_than then parser:emit() return "data" end else discard_rcdata_end_tag(parser, "</" .. 
table.concat(parser.temp_buffer)) parser.temp_buffer = {} return parser:tokenize("script_data_escaped") end end HtmlStates.script_data_double_escape_start = function(parser) local codepoint = parser.codepoint if is_alpha(codepoint) or codepoint == solidus or codepoint == greater_than then local current_tag = table.concat(parser.current_token.name or {}) parser:emit_character(uchar(codepoint)) if current_tag == "script" then return "script_data_double_escaped" else return "script_data_escaped" end elseif is_upper_alpha(codepoint) then parser:emit_character(uchar(codepoint)) table.insert(parser.temp_buffer, uchar(codepoint) + 0x20) return "script_data_double_escape_start" elseif is_lower_alpha(codepoint) then parser:emit_character(uchar(codepoint)) table.insert(parser.temp_buffer, uchar(codepoint)) return "script_data_double_escape_start" else return parser:tokenize("script_data_escaped") end end HtmlStates.script_data_double_escaped = function(parser) local codepoint = parser.codepoint codepoint = fix_null(codepoint) if codepoint == hyphen then parser:emit_character("-") return "script_data_double_escaped_dash" elseif codepoint == less_than then parser:emit_character("<") return "script_data_double_escaped_less_than_sign" elseif codepoint == EOF then parser:emit_eof() else parser:emit_character(uchar(codepoint)) return "script_data_double_escaped" end end HtmlStates.script_data_double_escaped_dash = function(parser) local codepoint = parser.codepoint codepoint = fix_null(codepoint) if codepoint == hyphen then parser:emit_character("-") return "script_data_double_escaped_dash" elseif codepoint == less_than then parser:emit_character("<") return "script_data_double_escaped_less_than_sign" elseif codepoint == greater_than then parser:emit_character(">") return "script_data" elseif codepoint == EOF then parser:emit_eof() else parser:emit_character(uchar(codepoint)) return "script_data_double_escaped" end end HtmlStates.script_data_double_escaped_less_than_sign = 
function(parser) local codepoint = parser.codepoint if codepoint == solidus then parser:emit("/") return "script_data_double_escape_end" else return parser:tokenize("script_data_double_escaped") end end HtmlStates.script_data_double_escape_end = function(parser) local codepoint = parser.codepoint if is_alpha(codepoint) or codepoint == solidus or codepoint == greater_than then local current_tag = table.concat(parser.current_token.name or {}) parser:emit_character(uchar(codepoint)) if current_tag == "script" then return "script_data_escaped" else return "script_data_double_escaped" end elseif is_upper_alpha(codepoint) then parser:emit_character(uchar(codepoint)) table.insert(parser.temp_buffer, uchar(codepoint) + 0x20) return "script_data_double_escape_start" elseif is_lower_alpha(codepoint) then parser:emit_character(uchar(codepoint)) table.insert(parser.temp_buffer, uchar(codepoint)) return "script_data_double_escape_start" else return parser:tokenize("script_data_double_escaped") end end -- formatting elements needs special treatment local formatting_element_names ={ a = true, b = true, big = true, code = true, em = true, font = true, i = true, nobr = true, s = true, small = true, strike = true, strong = true, tt = true, u = true } local function is_formatting_element(name) return formatting_element_names[name] end local function hash_from_array(tbl) local t = {} for _, v in ipairs(tbl) do t[v] = true end return t end local special_elements_list = hash_from_array {"address", "applet", "area", "article", "aside", "base", "basefont", "bgsound", "blockquote", "body", "br", "button", "caption", "center", "col", "colgroup", "dd", "details", "dir", "div", "dl", "dt", "embed", "fieldset", "figcaption", "figure", "footer", "form", "frame", "frameset", "h1", "h2", "h3", "h4", "h5", "h6", "head", "header", "hgroup", "hr", "html", "iframe", "img", "input", "keygen", "li", "link", "listing", "main", "marquee", "menu", "meta", "nav", "noembed", "noframes", "noscript", 
"object", "ol", "p", "param", "plaintext", "pre", "script", "section", "select", "source", "style", "summary", "table", "tbody", "td", "template", "textarea", "tfoot", "th", "thead", "title", "tr", "track", "ul", "wbr", "xmp", "mi","mo","mn","ms","mtext", "annotation-xml","foreignObject","desc", "title" } local function is_special(name) return special_elements_list[name] end -- these lists are used in HtmlParser:generate_implied_endtags() local implied_endtags = {dd=true, dt=true, li = true, optgroup = true, option = true, p = true, rb = true, rp = true, rd = true, trc = true} local implied_endtags_thoroughly = {dd=true, dt=true, li = true, optgroup = true, option = true, p = true, rb = true, rp = true, rd = true, trc = true, caption = true, colgroup = true, tbody = true, td = true, tfoot = true, th = true, thead = true, tr = true } -- find if unfinished tags list contain a tag -- it fails if any element from element_list is matched before that tag local function is_in_scope(parser, target, element_list) for i = #parser.unfinished, 1, -1 do local node = parser.unfinished[i] local tag = node.tag if tag == target then return true elseif element_list[tag] then return false end end return false end local particular_scope_elements = { applet = true, caption = true, html = true, table = true, td = true, th = true, marquee = true, object = true, template = true, mi = true, mo = true, mn = true, ms = true, mtext = true, ["annotation-xml"] = true, foreignObject = true, desc = true, title = true, } local function is_in_particular_scope(parser, target) return is_in_scope(parser, target, particular_scope_elements) end -- derived scope lists -- -- list_item scope local list_item_scope_elements = {ol = true, ul = true} for k,v in pairs(particular_scope_elements) do list_item_scope_elements[k] = v end local function is_in_list_item_scope(parser, target) return is_in_scope(parser, target, list_item_scope_elements) end -- button scope local button_scope_elements = {button = true} 
for k, v in pairs(particular_scope_elements) do
  button_scope_elements[k] = v
end

--- is_in_scope() limited by button_scope_elements.
local function is_in_button_scope(parser, target)
  return is_in_scope(parser, target, button_scope_elements)
end

-- table scope
local table_scope_elements = {html = true, table = true, template = true}

--- is_in_scope() limited by table_scope_elements.
local function is_in_table_scope(parser, target)
  return is_in_scope(parser, target, table_scope_elements)
end

-- select scope
--- Find out whether target is in select scope.
-- This scope is specific: every element except optgroup and option stops
-- the search.
-- @param parser parser object with the `unfinished` list
-- @param target element name to look for
-- @return boolean
local function is_in_select_scope(parser, target)
  for i = #parser.unfinished, 1, -1 do
    local node = parser.unfinished[i]
    local tag = node.tag
    if tag == target then
      return true
    elseif tag == "optgroup" or tag == "option" then
      -- only these two tags may be skipped
    else
      return false
    end
  end
  return false
end

-- List of active formatting elements
-- https://html.spec.whatwg.org/multipage/parsing.html#the-list-of-active-formatting-elements
-- we don't implement it yet, but maybe in the future.

local HtmlTreeStates = {}

--- @type HtmlParser
local HtmlParser = {}

--- Initialize the HTML Object
---@param body string HTML to be parsed
---@return table initialized object
function HtmlParser:init(body)
  local o = {}
  setmetatable(o, self)
  self.__index = self
  o.body = self:normalize_newlines(body) -- HTML string
  o.position = 0                         -- position in the parsed string
  o.unfinished = {}                      -- list of opened elements
  o.Document = Root:init()               -- root node of the DOM
  o.default_state = "data"               -- default state machine state
  o.state = o.default_state              -- working state of the machine
  o.return_state = o.default_state       -- special state set by entities parsing
  o.temp_buffer = {}                     -- keep temporary data
  o.current_token = {type="start"}       -- currently processed token
  o.insertion_mode = "initial"           -- tree construction state
  o.head_pointer = nil                   -- pointer to the Head element
  o.form_pointer = nil
  o.active_formatting = {}               -- list of active formatting elements
  o.scripting_flag = false               -- we will not support scripting
  return o
end

--- Convert "\r\n" and lone "\r" to "\n".
-- @param body string
-- @return string with normalized newlines
function HtmlParser:normalize_newlines(body)
  -- gsub returns the string and the number of substitutions; the parentheses
  -- keep only the string, so the count does not leak to the caller
  return (body:gsub("\r\n", "\n"):gsub("\r", "\n"))
end

-- declare void elements
local self_closing_tags_list = {"area", "base", "br", "col", "embed", "hr", "img", "input",
    "link", "meta", "param", "source", "track", "wbr"}

local self_closing_tags = {}
for _, v in ipairs(self_closing_tags_list) do
  self_closing_tags[v] = true
end

--- Execute the HTML parser
--- @return table Root node of the HTML DOM
function HtmlParser:parse()
  -- we assume utf8 input, you must convert it yourself if the source is
  -- in a different encoding. for example using luaxml-encodings library
  self.text = {}
  self.state = self.default_state
  -- this should enable us to pass over some characters that we want to ignore
  -- for example scripts, css, etc.
  self.ignored_pos = -1
  for pos, ucode in utf8.codes(self.body) do
    -- save buffer info and run the tokenize function
    if pos > self.ignored_pos then
      self.position = pos
      self.codepoint = ucode
      self.character = uchar(ucode)
      -- if tokenizer doesn't return a new state, it continues in the current one
      self.state = self:tokenize(self.state) or self.state
    end
  end
  return self:finish()
end

--- Run one step of the tokenizer state machine for self.codepoint.
-- @param state name of the state to execute (defaults to self.state)
-- @return name of the next state, or nil when the state function returned nothing
function HtmlParser:tokenize(state)
  state = state or self.state
  self.last_position = self.position
  self.element_state = false
  -- unknown states fall back to the default state
  local fn = HtmlStates[state] or function(parser) return self.default_state end
  local newstate = fn(self)
  -- this enables changing state from elements that need special treatment,
  -- like <script> or <style> (set in HtmlParser:start_tag())
  if self.element_state then return self.element_state end
  return newstate
end

--- Make `data` the current token and set its type.
-- @param typ token type
-- @param data table with token data (modified in place)
function HtmlParser:start_token(typ, data)
  data.type = typ
  self.current_token = data
end

--- Replace the current token with an empty one.
function HtmlParser:discard_token()
  self.current_token = {type="empty"}
end

--- Append data to a list field of the current token.
-- Nothing happens when the field doesn't exist or isn't a table.
-- @param name field name
-- @param data value to append
function HtmlParser:append_token_data(name, data)
  local token = self.current_token or {}
  if token[name] and type(token[name]) == "table" then
    table.insert(token[name], data)
  end
end

--- Set a field of the current token.
-- @param name field name
-- @param data new value
function HtmlParser:set_token_data(name, data)
  local token = self.current_token or {}
  token[name] = data
end

--- Write stuff from the temp buffer back to the document.
function HtmlParser:flush_temp_buffer()
  local token = self.current_token
  if token.type == "start_tag" then
    -- in start tag, entities can be only in attribute value
    for _, char in ipairs(self.temp_buffer) do
      table.insert(token.current_attr_value, char)
    end
  elseif self.return_state == "data" then
    -- handle entities in text
    for _, char in ipairs(self.temp_buffer) do
      self:start_token("character", {char=char})
      self:emit()
    end
  end
  self.temp_buffer = {}
end

--- Add a resolved entity to the attribute value or to the text.
-- @param char replacement text of the entity
function HtmlParser:add_entity(char)
  local token = self.current_token
  if token.type == "start_tag" then
    table.insert(token.current_attr_value, char)
  else
    self:start_token("character", {char=char})
    self:emit()
  end
  self.temp_buffer = {}
end

--- Emit a token; state machine functions should use this function.
-- Only the type is read from the `token` argument for non-character tokens;
-- the handlers called from here work with self.current_token.
-- @param token token table (defaults to self.current_token)
function HtmlParser:emit(token)
  token = token or self.current_token
  local token_type = token.type
  if token_type == "character" then
    table.insert(self.text, token.char)
  elseif token_type == "doctype" then
    self:add_text()
    self:add_doctype()
  elseif token_type == "start_tag" then
    self:add_text()
    self:reset_insertion_mode()
    self:start_tag()
  elseif token_type == "end_tag" then
    self:add_text()
    self:end_tag()
  elseif token_type == "comment" then
    self:add_text()
    self:add_comment()
  end
  -- other token types ("empty", "end_of_file", ...) are ignored here
end

--- Emit a character token.
-- @param text string to append to the current text
function HtmlParser:emit_character(text)
  self:start_token("character", {char=text})
  self:emit()
end

--- Emit the end_of_file token.
function HtmlParser:emit_eof()
  self:start_token("end_of_file", {})
  self:emit()
end

--- Return the most recently opened element, or the Document when no element
-- is opened.
function HtmlParser:get_parent()
  return self.unfinished[#self.unfinished] or self.Document
end

--- Remove the most recently opened element from the unfinished list.
-- @return the removed element (nil when the list is empty)
function HtmlParser:close_element()
  return table.remove(self.unfinished)
end

--- Add a text node to the current parent and reset the text buffer.
-- @param text string or list of strings (defaults to self.text)
function HtmlParser:add_text(text)
  if not text then text = self.text end
  if type(text) == "table" then
    if #text > 0 then text = table.concat(text) end
  end
  -- an empty table is left as is and skipped by the following test
  if type(text) == "string" and text ~= "" then
    local parent = self:get_parent()
    local node = Text:init(text, parent)
    parent:add_child(node)
  end
  self.text = {}
end

--- Save the attribute collected in the current start tag token and prepare
-- buffers for the next one. Does nothing for other token types.
function HtmlParser:start_attribute()
  local token = self.current_token or {}
  if token.type == "start_tag" then
    local attr_name = table.concat(token.current_attr_name)
    local attr_value = table.concat(token.current_attr_value)
    if attr_name ~= "" then
      table.insert(token.attr, {name = attr_name, value = attr_value})
    end
    self:set_token_data("current_attr_name", {})
    self:set_token_data("current_attr_value", {})
  end
end

--- Set node.xmlns from the xmlns attribute or from the parent element.
-- @param node element with the `attr` list
-- @param parent optional parent element (defaults to self:get_parent());
--   this parameter used to be ignored
function HtmlParser:set_xmlns(node, parent)
  -- try to find xmlns in node's attributes first
  for _, attr in ipairs(node.attr) do
    if attr.name == "xmlns" then
      node.xmlns = attr.value
      return
    end
  end
  -- if we cannot find xmlns attribute, then use
  -- xmlns from the parent element, or the default xmlns
  parent = parent or self:get_parent()
  node.xmlns = parent.xmlns or xmlns.HTML
end

--- Close the current element and add it to the DOM.
-- @return the closed element
function HtmlParser:pop_element()
  local el = self:close_element()
  local parent = self:get_parent()
  parent:add_child(el)
  return el
end

-- start tags that close an opened <p>
local close_p_at_start = hash_from_array {"address", "article", "aside", "blockquote", "center", "details", "dialog", "dir", "div", "dl", "fieldset", "figcaption", "figure", "footer", "header", "hgroup", "main", "menu", "nav", "ol", "p", "search", "section", "summary", "ul", "pre", "listing", "form", "table", "xmp", "hr"}

local close_headers = hash_from_array {"h1", "h2", "h3", "h4", "h5", "h6"}

-- insertion modes in which handle_insertion_mode() applies its rules
local body_modes = hash_from_array {"in_body", "in_cell", "in_row", "in_select", "in_table", "in_table_body", "in_frameset"}

local list_items = hash_from_array {"li", "dt", "dd"}

-- end tags that close everything up to the matching start tag
local close_address_at_end = hash_from_array{"address", "article", "aside", "blockquote", "button", "center", "details", "dialog", "dir", "div", "dl", "fieldset", "figcaption", "figure", "footer", "header", "hgroup", "listing", "main", "menu", "nav", "ol", "pre", "search", "section", "summary", "ul", "form"}

--- Close all unfinished elements until the element with the given name is
-- closed. Everything is closed when no such element is opened.
-- @param name element name
function HtmlParser:close_unfinished(name)
  for _ = #self.unfinished, 1, -1 do
    local el = self:pop_element()
    if el.tag == name then break end
  end
end

--- Close the currently opened <p> element.
function HtmlParser:close_paragraph()
  self:close_unfinished("p")
end

--- Return name of the current element.
function HtmlParser:current_element_name()
  return self:get_parent().tag
end

-- special elements that don't stop the search in handle_list_item()
local not_specials = hash_from_array { "address", "div", "p"}

--- Close a previous list item when a new li, dt or dd starts.
-- dt and dd close each other, li closes only itself.
-- @param self parser object
-- @param name name of the started element
local function handle_list_item(self, name)
  local names = {dt = true, dd = true}
  if name == "li" then names = {li=true} end
  for i = #self.unfinished, 1, -1 do
    local current = self.unfinished[i]
    local current_tag = current.tag
    if names[current_tag] then
      self:generate_implied_endtags(nil, {current.tag})
      -- close everything that is still opened above the list item,
      -- and the list item itself
      for _ = #self.unfinished, i, -1 do
        self:pop_element()
      end
      break
    elseif is_special(current_tag) and not not_specials[current_tag] then
      -- the exception for address, div and p applies to the element on the
      -- stack, not to the started list item; testing `name` made these
      -- three elements stop the search
      break
    end
  end
end

--- Close <p>, but only if it is in button scope.
local close_paragraph = function(self)
  if is_in_button_scope(self, "p") then
    self:close_paragraph()
  end
end

--- Simple handling of
-- https://html.spec.whatwg.org/multipage/parsing.html#tree-construction
-- We don't support most rules, just the most important ones for avoiding
-- mismatched tags. The token may be modified (type or name).
-- @param token start_tag or end_tag token
-- @return false when the elements that belong to an end tag were already
--   closed here, true otherwise
function HtmlParser:handle_insertion_mode(token)
  if body_modes[self.insertion_mode] then
    if token.type == "start_tag" then
      local name = table.concat(token.name)
      if close_p_at_start[name] then close_paragraph(self) end
      if close_headers[name] then
        close_paragraph(self)
        -- close current element if it is already header
        if close_headers[self:current_element_name()] then
          self:pop_element()
        end
      elseif name == "pre" or name == "listing" then
        -- we should ignore next "\n" char token
      elseif name == "image" then
        -- image tag is an error, change to <img>
        token.name = {"img"}
      elseif list_items[name] then
        handle_list_item(self, name)
        close_paragraph(self)
      end
    elseif token.type == "end_tag" then
      local name = table.concat(token.name)
      if close_address_at_end[name] then
        if is_in_scope(self, name, {}) then
          self:generate_implied_endtags()
          self:close_unfinished(name)
          return false
        else
          token.type = "ignore"
        end
      elseif name == "p" then
        if not is_in_button_scope(self, "p") then
          -- </p> without opened <p> produces an empty paragraph
          local parent = self:get_parent()
          local node = Element:init("p", parent)
          table.insert(self.unfinished, node)
        end
        -- use self:close_paragraph() instead of close_paragraph() because
        -- we don't need to check scope at this point
        self:close_paragraph()
        -- the paragraph is closed now; without this, end_tag() popped one
        -- more element whenever another <p> was opened further up
        return false
      elseif name == "br" then
        -- NOTE(review): end_tag() ignores the token after this change of
        -- type, so </br> is dropped, it isn't converted to <br>
        token.type = "start_tag"
      elseif close_headers[name] then
        -- detect if there is any opened h1-h6 element
        local header_open = false
        for i = #self.unfinished, 1, -1 do
          if close_headers[self.unfinished[i].tag] then
            header_open = true
            break
          end
        end
        if not header_open then
          token.type = "ignore"
        else
          -- close elements until the innermost header is closed; selecting
          -- the header with pairs() depended on the table order
          while #self.unfinished > 0 do
            local el = self:pop_element()
            if close_headers[el.tag] then break end
          end
          -- already closed, end_tag() must not pop another element
          return false
        end
      end
    end
  end
  return true
end

-- elements with content tokenized in the rawtext state
local rawtext_elements = hash_from_array {"style", "textarea", "xmp"}

--- Process the current start tag token: create the element and either add
-- it to the DOM (void and self-closing elements) or to the unfinished list.
function HtmlParser:start_tag()
  local token = self.current_token
  -- in this handler we close <p> or <li> elements without explicit closing tags
  self:handle_insertion_mode(token)
  if token.type == "start_tag" then
    -- close all currently opened attributes
    self:start_attribute()
    -- initiate Element object, pass attributes and info about self_closing
    local name = table.concat(token.name)
    local parent = self:get_parent()
    local node = Element:init(name, parent)
    node.attr = token.attr
    node.self_closing = token.self_closing
    self:set_xmlns(node)
    if token.self_closing        -- <img />
      or self_closing_tags[name] -- void elements
    then
      parent:add_child(node, node.tag)
    else
      -- add to the unfinished list
      table.insert(self.unfinished, node)
    end
    -- request tokenizer state for elements with special content,
    -- HtmlParser:tokenize() returns it instead of the regular next state
    if name == "title" then
      self.element_state = "rcdata"
    elseif rawtext_elements[name] then
      self.element_state = "rawtext"
    elseif name == "script" then
      self.element_state = "script_data"
    end
  end
end

--- Process the current end tag token and close the current element.
function HtmlParser:end_tag()
  local token = self.current_token
  local should_pop = self:handle_insertion_mode(token)
  -- handle_insertion_mode() may change the token type to ignore the tag
  if token.type == "end_tag" then
    if #self.unfinished == 0 then return nil end
    -- we shouldn't close elements if handle_insertion_mode() already closed them
    if should_pop then
      -- close the current element only if the token is in the current scope
      if is_in_scope(self, table.concat(token.name), {}) then
        self:pop_element()
      end
    end
  end
end

--- Add the current comment token to the DOM.
function HtmlParser:add_comment()
  local token = self.current_token
  if token.type == "comment" then
    self:start_attribute()
    local parent = self:get_parent()
    local text = table.concat(token.data)
    local node = Comment:init(text, parent)
    parent:add_child(node)
  end
end

--- Add the current doctype token to the DOM.
function HtmlParser:add_doctype()
  local token = self.current_token
  if token.type == "doctype" then
    self:start_attribute()
    local parent = self:get_parent()
    local name = table.concat(token.name)
    local node = Doctype:init(name, parent)
    if #token.data > 0 then
      node:add_data(table.concat(token.data))
    end
    parent:add_child(node)
  end
end

--- Set the insertion mode.
-- @param name name of the insertion mode
function HtmlParser:switch_insertion(name)
  self.insertion_mode = name
end

--- Return the current node (same as get_parent()).
function HtmlParser:current_node()
  return self:get_parent()
end

--- Return the adjusted current node.
-- we don't support this feature yet, the current node is returned
-- https://html.spec.whatwg.org/multipage/parsing.html#adjusted-current-node
function HtmlParser:adjusted_current_node()
  return self:current_node()
end

-- insertion modes that depend only on the element name
local simple_modes = {
  body = "in_body",
  td = "in_cell",
  th = "in_cell",
  tr = "in_row",
  tbody = "in_table_body",
  thead = "in_table_body",
  tfoot = "in_table_body",
  caption = "in_caption",
  colgroup = "in_column_group",
  table = "in_table",
  template = "current_template_insertion_mode",
  frameset = "in_frameset"
}

--- Select the insertion mode according to the opened elements.
-- https://html.spec.whatwg.org/multipage/parsing.html#reset-the-insertion-mode-appropriately
function HtmlParser:reset_insertion_mode()
  local last = false
  for position = #self.unfinished, 1, -1 do
    local node = self.unfinished[position]
    if position == 1 then last = true end
    local name = node.tag
    -- switch to insertion mode based on the current element name
    -- there is lot of other cases, but we support only basic ones
    -- we can support other insertion modes in the future
    if name == "head" and last == true then
      self:switch_insertion("in_head")
      return
    elseif name == "html" then
      if not self.head_pointer then
        self:switch_insertion("before_head")
      else
        self:switch_insertion("after_head")
      end
      return
    elseif name == "select" then
      if not last then
        -- look for <table> among ancestors, the first opened element is
        -- not tested
        for x = position - 1, 1, -1 do
          if x == 1 then break end
          local ancestor = self.unfinished[x]
          local ancestor_name = ancestor.tag
          if ancestor_name == "template" then
            break
          elseif ancestor_name == "table" then
            self:switch_insertion("in_select_in_table")
            return
          end
        end
      end
      self:switch_insertion("in_select")
      return
    elseif simple_modes[name] then
      self:switch_insertion(simple_modes[name])
      return
    elseif last == true then
      self:switch_insertion("in_body")
      return
    end
  end
  -- by default use in_body
  self:switch_insertion("in_body")
end

--- Close current elements while they have implied end tags.
-- https://html.spec.whatwg.org/multipage/parsing.html#closing-elements-that-have-implied-end-tags
-- @param included set of element names that can be closed
--   (defaults to implied_endtags)
-- @param ignored list of element names that must not be closed
function HtmlParser:generate_implied_endtags(included, ignored)
  local closable = included or implied_endtags
  if ignored and #ignored > 0 then
    -- work on a copy: removing names directly from `included` changed the
    -- shared implied_endtags table for all following calls
    local filtered = {}
    for name, value in pairs(closable) do
      filtered[name] = value
    end
    for _, name in ipairs(ignored) do
      filtered[name] = nil
    end
    closable = filtered
  end
  -- keep removing elements while they are in the list; the loop ends on the
  -- Document too, because it has no tag
  local current = self:current_node() or {}
  while closable[current.tag] do
    self:pop_element()
    current = self:current_node() or {}
  end
end

--- Finish parsing: process EOF, flush text and close all opened elements.
-- @return root node of the DOM
function HtmlParser:finish()
  -- tokenize without any real character
  self.codepoint = EOF
  self:tokenize(self.state)
  self:add_text()
  if #self.unfinished == 0 then
    -- add implicit html tag
    -- NOTE(review): start_tag() has no parameters, so "html" is ignored and
    -- the call works with self.current_token
    self:start_tag("html")
  end
  -- close all unfinished elements
  while #self.unfinished > 0 do
    self:pop_element()
  end
  -- return root element
  return self.Document
end

--
M.Text = Text
M.Element = Element
M.HtmlParser = HtmlParser
M.HtmlStates = HtmlStates               -- table with functions for particular parser states
M.self_closing_tags = self_closing_tags -- list of void elements
M.search_entity_tree = search_entity_tree
M.is_in_particular_scope = is_in_particular_scope
M.is_in_list_item_scope = is_in_list_item_scope
M.is_in_button_scope = is_in_button_scope
M.is_in_table_scope = is_in_table_scope
M.is_in_select_scope = is_in_select_scope
return M