#!/usr/bin/env texlua
-- extractbb-lua
-- https://github.com/gucci-on-fleek/extractbb
-- SPDX-License-Identifier: MPL-2.0+
-- SPDX-FileCopyrightText: 2024--2025 Max Chernoff
--
-- Inclusion Methods
-- =================
--
-- This script can use two different methods to extract bounding boxes from
-- images: the "img" module and the "pdfe" module. The "img" module will be
-- automatically selected in most cases and supports all image types that are
-- supported by the original "extractbb" program. If and only if the "img"
-- module fails to load, the "pdfe" module will be used as a fallback. However,
-- the "pdfe" module only supports PDF files. Both modules are built into the
-- LuaTeX binaries; however, due to some technical issues, the "img" module may
-- fail to load on some more exotic platforms.
--
--
-- Compatibility
-- =============
--
-- Based off of my testing, this Lua script is 100% compatible with the original
-- C-based "extractbb" program, with the following exceptions:
--
--   * When running in "img" mode, the PDF version is always reported as "1.5".
--
--   * When running in "img" mode, if the requested bounding box is not found,
--     the script will fall back to the Crop box or the Media box, instead of
--     following the original fallback order. (In practice, almost all PDFs set
--     all their bounding boxes equal to each other, and even if the boxes are
--     set to different values, the script will still return the requested box,
--     provided that it is set in the PDF.)
--
--   * When running in "pdfe" mode, only PDF files are supported.
--
-- All of these issues are very unlikely to affect any real-world documents.
--
--
-- Security
-- ========
--
-- This script is designed to be run safely from restricted shell escape. A few
-- security features:
--
--   * The majority of this script runs inside a sandboxed Lua environment,
--     which only exposes a very restricted set of functions.
--
--   * All file-related functions available inside the sandbox first check with
--     kpathsea to ensure that the file is allowed to be opened.
--
--   * In the event of any errors, the script immediately exits.
--
--   * This script does not run (fork/exec) any external programs.
--
--   * This script is written entirely in Lua, so overflow/use-after-free
--     vulnerabilities are not possible.
--
-- Some potential security concerns:
--
--   * This script has not been audited or reviewed by anyone other than myself.
--
--   * The underlying LuaTeX modules may themselves have security
--     vulnerabilities, which would be inherited by this script.


----------------------
--- Initialization ---
----------------------

-- Pre-sandbox variables/constants. These are plain locals declared before the
-- sandbox is installed, so the sandboxed code further down still reaches them
-- as upvalues.

-- Whether the sandboxed error handler prints a message and traceback; toggled
-- by the "-q" and "-v" options.
local show_errors = true
-- Optional timestamp taken from the environment, later passed to "os.date"
-- for the "CreationDate" line. "nil" if the variable is unset or not numeric.
local SOURCE_DATE_EPOCH = tonumber(os.getenv("SOURCE_DATE_EPOCH"))
-- Printed by "--version" and written to the "Creator" line of the output.
local version = "extractbb.lua v1.1.0 (2025-02-11)" --%%version %%dashdate

-- Required for any kpathsea calls to work.
kpse.set_program_name("texlua", "extractbb")

-- Required to use the "img" module from texlua, but only works for LuaTeX
-- versions >= 1.21.0.
-- The sandbox is not installed yet, so "error" below is still Lua's built-in
-- "error" function, not the handler that is defined later for the sandbox.
if not (status.development_id >= 7661) then
    error("LuaTeX version is too old, cannot proceed.")
end
texconfig.texlua_img = true

-- We need to set \outputmode to PDF to be able to use most of the "img" module
-- functions, but to set \outputmode, we need to initialize the TeX interpreter.
tex.initialize()
-- Publish the initialized module as the global "tex".
_G.tex = package.loaded.tex
tex.enableprimitives("", tex.extraprimitives())
tex.outputmode = 1
-- NOTE(review): presumably selects a non-interactive mode -- confirm against
-- the LuaTeX manual.
tex.interactionmode = 0

-- "pdf" module: publish it as a global, then ignore unknown images and set the
-- PDF version to 2.0.
-- NOTE(review): the reason for choosing these particular settings is not
-- visible in this file.
_G.pdf = package.loaded.pdf
pdf.setignoreunknownimages(1)
pdf.setmajorversion(2)
pdf.setminorversion(0)


------------------
--- Sandboxing ---
------------------

-- Initial contents of the sandbox. A few more entries ("error", "open_file",
-- "open_image") are added below, before the table is installed as "_ENV".
local env = {
    -- Command-line arguments
    arg = arg,

    -- Individual base functions
    ipairs   = ipairs,
    pairs    = pairs,
    print    = print,
    select   = select,
    tonumber = tonumber,
    type     = type,

    -- Library tables that are passed through as-is
    math  = math,
    pdfe  = pdfe,
    table = table,

    -- Library tables that are cut down to the listed fields only
    io = {
        stdout = io.stdout,
    },
    os = {
        date = os.date,
        exit = os.exit,
    },
}

do
    -- Saved global functions. The functions defined below are called after
    -- "_ENV" has been replaced by the sandbox, where most of these names are no
    -- longer reachable as globals, so they are captured as locals here.
    local debug_traceback  = debug.traceback
    local find_file        = kpse.find_file
    -- The "img" module may fail to load on some platforms. Guard the lookup so
    -- that a missing module leaves "img_scan" as "nil" (which selects the
    -- "pdfe" fallback further down) instead of raising an "attempt to index a
    -- nil value" error right here.
    local img_scan         = img and img.scan
    local io_open          = io.open
    local io_stderr        = io.stderr
    local kpse_in_name_ok  = kpse.in_name_ok
    local kpse_out_name_ok = kpse.out_name_ok
    local kpse_var_value   = kpse.var_value
    local lfs_attributes   = lfs.attributes
    local os_exit          = os.exit
    local os_setenv        = os.setenv
    local pdfe_open        = pdfe.open
    local select           = select
    local tostring         = tostring

    -- Fatal error handler, exposed to the sandbox as "error".
    --
    -- Unless errors have been silenced, writes a header, every argument
    -- (converted with "tostring" and followed by a space), a blank line, and a
    -- traceback to stderr. In all cases it then flushes stderr and exits the
    -- script with status 1, so it never returns to its caller.
    local function error(...)
        if show_errors then
            local parts = { ... }
            local count = select("#", ...)

            -- Header
            io_stderr:write("! extractbb ERROR: ")

            -- Message. The explicit count (rather than "#parts") makes sure
            -- that "nil" arguments are printed too.
            for index = 1, count do
                io_stderr:write(tostring(parts[index]), " ")
            end

            -- Blank line, then the traceback (level 2 skips this function)
            io_stderr:write("\n", "\n", debug_traceback(nil, 2), "\n")
        end

        -- Flush and exit
        io_stderr:flush()
        os_exit(1)
    end

    env.error = error

    -- Make sure that "openin_any" is at least "restricted", and that
    -- "openout_any" is at least "paranoid".
    local initial_openin  = kpse_var_value("openin_any")
    local initial_openout = kpse_var_value("openout_any")

    -- "p" (paranoid) is stricter than "r" (restricted), so an existing "p"
    -- must be left alone; any other value is tightened to "r". This decision
    -- depends only on "openin_any" itself: consulting "openout_any" here (or
    -- joining the two tests with "or") would downgrade a paranoid setting.
    if (initial_openin ~= "r") and (initial_openin ~= "p") then
        os_setenv("openin_any",  "r")
    end

    -- Output is always forced to the strictest setting.
    if (initial_openout ~= "p") then
        os_setenv("openout_any", "p")
    end

    -- Resolves an input file name to a path and returns that path.
    --
    -- The name is looked up through kpathsea (format "graphic/figure"). The
    -- script is aborted via "error" if the file cannot be found, if kpathsea
    -- does not allow it to be read, or if it is not a regular file.
    local function resolve_input_name(file_name)
        local resolved = find_file(file_name, "graphic/figure", true)
        if not resolved then
            error("Cannot find input file:", file_name)
        end

        if not kpse_in_name_ok(resolved) then
            error("Input file is not allowed:", resolved)
        end

        if lfs_attributes(resolved, "mode") ~= "file" then
            error("Input file is not a regular file:", resolved)
        end

        return resolved
    end

    -- Validates an output file name and returns it unchanged.
    --
    -- The script is aborted via "error" if kpathsea does not allow the file to
    -- be written, if the name has no extension, if the extension is anything
    -- other than "xbb" or "bb", or if the part before the extension contains
    -- a control character or one of the characters % > < * |
    local function resolve_output_name(file_name)
        if not kpse_out_name_ok(file_name) then
            error("Output file is not allowed:", file_name)
        end

        -- Split the name at its last dot.
        local stem, suffix = file_name:match("(.+)%.([^.]-)$")

        local has_stem   = stem   and (stem   ~= "")
        local has_suffix = suffix and (suffix ~= "")

        if not (has_stem and has_suffix) then
            error("Output file has no extension:", file_name)
        end

        if not ((suffix == "xbb") or (suffix == "bb")) then
            error("Output file has an invalid extension:", file_name)
        end

        -- We shouldn't allow files with weird characters in their names.
        local weird = stem:match("[%c%%\t\r\n><*|]")
        if weird then
            error("Output file has an invalid name:", file_name)
        end

        return file_name
    end

    -- Opens a file on behalf of the sandbox and returns the file handle.
    --
    --   file_name:   name of the file; it is checked by "resolve_input_name"
    --                or "resolve_output_name" before it is opened.
    --   read_write:  either "read" or "write".
    --   binary_text: either "binary" or "text".
    --
    -- Any other value, or a failure to open the file, aborts the script.
    function env.open_file(file_name, read_write, binary_text)
        local path, flags

        if read_write == "read" then
            path, flags = resolve_input_name(file_name), "r"
        elseif read_write == "write" then
            path, flags = resolve_output_name(file_name), "w"
        else
            error("Invalid read/write mode:", read_write)
        end

        -- Text mode needs no extra flag; binary mode appends "b".
        if binary_text == "binary" then
            flags = flags .. "b"
        elseif binary_text ~= "text" then
            error("Invalid binary/text mode:", binary_text)
        end

        local handle, reason = io_open(path, flags)
        if not handle then
            error("Cannot open file:", path, reason)
        end

        return handle
    end

    -- Opens a PDF file with the original "pdfe.open", but only after the file
    -- name has passed the same checks as every other input file. Note that
    -- "env.pdfe" is the "pdfe" library table itself, so this replaces its
    -- "open" field; the original function was saved in "pdfe_open" above.
    function env.pdfe.open(file_name)
        return pdfe_open(resolve_input_name(file_name))
    end

    -- Scans an image file with the "img" module and returns the result. The
    -- file name is checked like every other input file first; "page" and "box"
    -- are passed on as the "page" and "pagebox" fields. If "img.scan" is not
    -- available, "open_image" is set to "false" instead, which the main script
    -- uses to select the "pdfe" code path.
    if img_scan then
        function env.open_image(file_name, page, box)
            local resolved = resolve_input_name(file_name)
            local request = {
                filename = resolved,
                filepath = resolved,
                page     = page,
                pagebox  = box,
            }

            return img_scan(request)
        end
    else
        env.open_image = false
    end
end

-- Lock the sandbox: reading or assigning any name that the sandbox does not
-- already contain is reported as a fatal error. The first argument (the table
-- itself) is dropped; the rest is the offending key, plus the new value in
-- the case of an assignment.
local function bad_index(_, ...)
    env.error("Attempt to access an undefined index:", ...)
end

setmetatable(env, {
    __metatable = false,
    __newindex  = bad_index,
    __index     = bad_index,
})

-- Install the sandbox. From here on, global names in this chunk are looked up
-- in "env".
_ENV = env


-----------------------------------
--- Post-Sandbox Initialization ---
-----------------------------------

-- Conversion factors and the date format used for the "CreationDate" line.
local BP_TO_SP, IN_TO_BP = 65781.76, 72
local DATE_FORMAT        = "%a %b %d %H:%M:%S %Y" -- "%c"

-- Local aliases for the library functions and globals used most often.
local script_arguments = arg
local floor            = math.floor
local insert, remove   = table.insert, table.remove
local unpack           = table.unpack

-- Rounds a number to the nearest integer; exact halves are rounded up.
local function round(number)
    local shifted = number + 0.5
    return floor(shifted)
end


-------------------------
--- Argument Handling ---
-------------------------

-- Table of option handlers, keyed by the option name without its first dash.
-- Every handler receives the list of arguments that have not been processed
-- yet, and may consume further entries from the front of that list.
local process_arguments = {}

-- > Specify a PDF pagebox for bounding box
-- > pagebox=cropbox, mediabox, artbox, trimbox, bleedbox
local bbox_option = "auto"

process_arguments.B = function(script_arguments)
    -- The pagebox name is the argument that follows the option.
    local requested_box = remove(script_arguments, 1)
    bbox_option = requested_box
end

-- > Show this help message and exit
local help_text = [[
Usage: extractbb [-B pagebox] [-p page] [-q|-v] [-O] [-m|-x] FILE...
       extractbb --help|--version
Extract bounding box from PDF, PNG, JPEG, JP2, or BMP file; default output below.

Options:
  -B pagebox    Specify a PDF pagebox for bounding box
                pagebox=cropbox, mediabox, artbox, trimbox, bleedbox
  -h | --help   Show this help message and exit
  --version     Output version information and exit
  -p page       Specify a PDF page to extract bounding box
  -q            Be quiet
  -v            Be verbose
  -O            Write output to stdout
  -m            Output .bb  file used in DVIPDFM (default)
  -x            Output .xbb file used in DVIPDFMx
]]

-- Prints the usage text and exits with status 0.
local function show_help(script_arguments)
    print(help_text)
    os.exit(0)
end

-- "-h" and "--help" share the same handler.
process_arguments.h        = show_help
process_arguments["-help"] = show_help

-- > Output version information and exit
-- Prints the version string and exits with status 0.
local function show_version(script_arguments)
    print(version)
    os.exit(0)
end

-- "-V" and "--version" share the same handler.
process_arguments.V           = show_version
process_arguments["-version"] = show_version

-- > Specify a PDF page to extract bounding box
local page_number = 1

process_arguments.p = function(script_arguments)
    -- The page is the argument that follows the option. It becomes "nil" if
    -- that argument is missing or is not a number.
    local requested_page = remove(script_arguments, 1)
    page_number = tonumber(requested_page)
end

-- Builds an option handler that sets "show_errors" to the given value.
local function set_show_errors(value)
    return function(script_arguments)
        show_errors = value
    end
end

-- > Be quiet
process_arguments.q = set_show_errors(false)

-- > Be verbose
process_arguments.v = set_show_errors(true)

-- > Write output to stdout
-- While "output_file" is still unset after argument processing, an output
-- file named after the input file is opened instead.
local output_file

process_arguments.O = function(script_arguments)
    output_file = io.stdout
end

-- Output format. The default is "xbb", unless the script was invoked under a
-- name that contains "ebb", in which case the default is "bb". The "-m" and
-- "-x" options override the default.
local output_format = "xbb"

-- Only the last path component of the script name is inspected, i.e.
-- everything after the final "/" or "\". Matching against the whole path would
-- also select "bb" whenever some directory name happens to contain "ebb"
-- (for example "/home/webb/").
local invoked_name = script_arguments[0]:match("[^/\\]*$")

if invoked_name:match("ebb") then
    output_format = "bb"
end

-- > Output .bb  file used in DVIPDFM (default)
function process_arguments.m(script_arguments)
    output_format = "bb"
end

-- > Output .xbb file used in DVIPDFMx
function process_arguments.x(script_arguments)
    output_format = "xbb"
end

-- Input file name. It is taken from the argument that follows "-i" or
-- "--input-name"; the argument loop also routes bare (dash-less) arguments to
-- this handler.
local input_name

local function set_input_name(script_arguments)
    input_name = remove(script_arguments, 1)
end

process_arguments.i              = set_input_name
process_arguments["-input-name"] = set_input_name

-- Drop the interpreter name (index -1) and the script name (index 0), so that
-- only the real arguments remain.
for _, index in ipairs { -1, 0 } do
    script_arguments[index] = nil
end

-- Consume the arguments from the front of the list until none are left.
while script_arguments[1] do
    local argument = remove(script_arguments, 1)

    -- Strip one leading dash: "-B" gives "B", and "--help" gives "-help".
    local command = argument:match("^%-(.*)$")

    -- An argument without a leading dash is an input file name: put it back
    -- and handle it as if "--input-name" had been given in front of it.
    if not command then
        insert(script_arguments, 1, argument)
        command = "-input-name"
    end

    local is_bundle = (command:len() >= 2) and (not command:match("^%-"))

    if is_bundle then
        -- Several single-character options in one argument ("-qv"): put them
        -- back at the front as separate arguments, in their original order.
        local position = 0
        for flag in command:gmatch(".") do
            position = position + 1
            insert(script_arguments, position, "-" .. flag)
        end
    else
        -- Look up the handler for this option and run it.
        local handler = process_arguments[command]

        if not handler then
            error("Invalid argument:", argument)
        end

        handler(script_arguments)
    end
end

-- Validate the arguments.
--
-- The page number is "nil" if "-p" was given without a numeric value. The
-- test must be written with "~=": "not" binds more tightly than "==", so
-- "not type(x) == "number"" compares a boolean with a string and is never
-- true.
if type(page_number) ~= "number" then
    error("Invalid page number:", page_number)
end

if not input_name then
    error("No input file specified.")
end

-- Validate the bounding box type. We need this rather crazy fallback scheme
-- to match the behaviour of "extractbb".
--
-- Each box is described by the name used for the "img" module and the name
-- used for the "pdfe" module.
local media_box = { img = "media", pdfe = "MediaBox" }
local crop_box  = { img = "crop",  pdfe = "CropBox"  }
local art_box   = { img = "art",   pdfe = "ArtBox"   }
local trim_box  = { img = "trim",  pdfe = "TrimBox"  }
local bleed_box = { img = "bleed", pdfe = "BleedBox" }

-- For every "-B" value, the boxes to try, in order. Apart from "auto", each
-- list is the requested box followed by the complete list of the line above.
local bbox_orders = {
    mediabox = { media_box },
    cropbox  = { crop_box, media_box },
    artbox   = { art_box, crop_box, media_box },
    trimbox  = { trim_box, art_box, crop_box, media_box },
    bleedbox = { bleed_box, trim_box, art_box, crop_box, media_box },
    auto     = { crop_box, art_box, trim_box, bleed_box, media_box },
}

local bbox_order = bbox_orders[bbox_option]

if not bbox_order then
    error("Invalid PDF box type:", bbox_option)
end

-- Default pixel resolution for each output format. It is used for images that
-- report no resolution (or a resolution of zero).
local dpi_by_format = {
    xbb = 72,
    bb  = 100,
}

local default_dpi = dpi_by_format[output_format]

if not default_dpi then
    error("Invalid output format:", output_format)
end

-- Open the output file, unless "-O" already selected stdout. The output name
-- is the input name with everything after its last dot replaced by the output
-- format; if the input name has no dot, the format is simply appended.
if not output_file then
    local stem = input_name:match("(.+)%.([^.]-)$")
    if not stem then
        stem = input_name
    end

    output_file = open_file(stem .. "." .. output_format, "write", "text")
end


------------------------
--- Image Processing ---
------------------------

-- Results of this section: the bounding box, the page count, the image type,
-- and the PDF version.
local x_min, y_min, x_max, y_max
local num_pages, image_type
local pdf_major_version, pdf_minor_version

if open_image then
    -- "img" code path. The first scan is made without a page or a box, and is
    -- only used to read the number of pages.
    local image = open_image(input_name)
    num_pages = image.pages

    if page_number > num_pages then
        error("Invalid page number:", page_number)
    end

    -- Scan again, now at the requested page and with the first box of the
    -- fallback order. If the requested bounding box is not available, LuaTeX
    -- will fall back to the crop box or the media box.
    image = open_image(input_name, page_number, bbox_order[1].img)

    if not image then
        error("Cannot open image:", input_name)
    end

    -- Read the metadata that we need.
    image_type = image.imagetype
    local corners = image.bbox

    if not corners then
        error("Cannot get bounding box:", page_number)
    end

    -- A missing or zero resolution is replaced by the default resolution.
    local x_dpi, y_dpi = image.xres, image.yres

    if (not x_dpi) or (x_dpi == 0) then
        x_dpi = default_dpi
    end

    if (not y_dpi) or (y_dpi == 0) then
        y_dpi = default_dpi
    end

    -- Convert the bounding box to PostScript points, in place. PDF values are
    -- divided by "BP_TO_SP"; for all other image types, the values are divided
    -- by the resolution and multiplied by "IN_TO_BP", where odd indices use
    -- the horizontal resolution and even indices use the vertical one.
    local is_pdf = (image_type == "pdf")

    for index, value in ipairs(corners) do
        if is_pdf then
            corners[index] = value / BP_TO_SP
        elseif index % 2 == 1 then
            corners[index] = value / x_dpi * IN_TO_BP
        else
            corners[index] = value / y_dpi * IN_TO_BP
        end
    end

    -- Save the bounding box.
    x_min, y_min, x_max, y_max = unpack(corners)

    -- We can't get the PDF version with the "img" library, so we'll just
    -- pretend that it's v1.5 (which supports most features).
    pdf_major_version, pdf_minor_version = 1, 5
else
    -- "pdfe" code path, which can only handle PDF files.
    image_type = "pdf"
    local document = pdfe.open(input_name)

    if pdfe.getstatus(document) ~= 0 then
        error("Cannot open PDF file:", input_name)
    end

    -- Check the number of pages.
    num_pages = pdfe.getnofpages(document)

    if type(num_pages) ~= "number" then
        error("Invalid number of pages:", num_pages)
    end

    if page_number > num_pages then
        error("Invalid page number:", page_number)
    end

    -- Get the page.
    local page = pdfe.getpage(document, page_number)

    if not page then
        error("Cannot get page:", page_number)
    end

    -- Get the bounding box: walk through the fallback order and keep the
    -- first box that the page provides. Here, we check the boxes in the exact
    -- same order that "extractbb" does.
    local corners
    for _, candidate in ipairs(bbox_order) do
        corners = pdfe.getbox(page, candidate.pdfe)

        if corners then
            break
        end
    end

    if not corners then
        error("Cannot get bounding box:", page_number)
    end

    -- Save the bounding box.
    x_min, y_min, x_max, y_max = unpack(corners)

    -- Get the PDF version.
    pdf_major_version, pdf_minor_version = pdfe.getversion(document)
end

-- Validate the bounding box: all four coordinates must be numbers.
--
-- Every coordinate is tested explicitly. Iterating with "ipairs" over
-- "{ x_min, y_min, x_max, y_max }" would stop at the first "nil", so a
-- missing coordinate (and everything after it) would never be checked.
local function is_number(value)
    return type(value) == "number"
end

if not (is_number(x_min) and is_number(y_min) and
        is_number(x_max) and is_number(y_max))
then
    error("Invalid bounding box:", x_min, y_min, x_max, y_max)
end


--------------
--- Output ---
--------------

-- Collect the output lines, in the order in which they are written.
local lines = {}

-- Formats one output line and appends it to the list.
local function add_line(template, ...)
    lines[#lines + 1] = template:format(...)
end

add_line("Title: %s", input_name)
add_line("Creator: %s", version)
add_line("BoundingBox: %d %d %d %d",
         round(x_min), round(y_min), round(x_max), round(y_max))

-- The high-resolution box and the PDF details are only written in "xbb" mode.
if output_format == "xbb" then
    add_line("HiResBoundingBox: %0.6f %0.6f %0.6f %0.6f",
             x_min, y_min, x_max, y_max)

    if image_type == "pdf" then
        add_line("PDFVersion: %d.%d", pdf_major_version, pdf_minor_version)
        add_line("Pages: %d", num_pages)
    end
end

add_line("CreationDate: %s", os.date(DATE_FORMAT, SOURCE_DATE_EPOCH))

-- Create the output text: every line starts with two percent signs and ends
-- with a newline, and one extra newline terminates the text.
local text = "%%" .. table.concat(lines, "\n" .. "%%") .. "\n" .. "\n"

-- Write the output text.
output_file:write(text)
output_file:close()

-- Everything is done, so now we can exit.
os.exit(0)