#! /usr/bin/python
"""xtab.py
PURPOSE
Read a table (from a text file) of data in normalized form and cross-tab it,
allowing multiple data columns to be crosstabbed.
AUTHOR
R. Dreas Nielsen (RDN)
COPYRIGHT AND LICENSE
Copyright (c) 2007-2018, R.Dreas Nielsen
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
The GNU General Public License is available at
NOTES
1. This code can be used either as a module or as a stand-alone script.
2. The sole function intended to be used by callers of this module is 'xtab()'.
3. When there are multiple values in the input that should go into a single
cell of the output, only the first of these is written into that cell
('first' is indeterminate). The 'xtab()' function allows logging of
the data selection statement (SQL) used to obtain the data for each cell,
and the result(s) obtained, and thus to determine which cell(s) have
multiple values.
=====================================================================================
"""
#=====================================================================================
# TODO:
# * Implement a class to wrap csv reader and writer objects
# to ensure appropriate closure of underlying file objects
# on error. (Or use 'with' in 2.6/3.0)
# * Implement error-checking of the column names in the command
# line arguments prior to calling 'xtab()'.
# * Add more specific error traps throughout.
#=====================================================================================
_version = "1.1.0"
_vdate = "2023-03-29"
import sys
import os
import os.path
import csv
import sqlite3
import copy
import logging
import traceback
_errmsg_noinfile = "No input filename specified."
_errmsg_badinfile = "Input file does not exist."
_errmsg_nooutfile = "No output filename specified."
_errmsg_norowheaders = "No row header columns specified."
_errmsg_nocolumheaders = "No column header columns specified."
_errmsg_nocellcolumns = "No cell value columns specified."
_errmsg_baderrlogfile = "Only one error log file name should be specified."
_errmsg_badsqllogfile = "Only one SQL log file name should be specified."
_errmsg_nohdrsep = "No header column separator character specified."
_errmsg_hdrseplen = "The header column separator must be a single character."
__help_msg = """Required Arguments:
-i
The name of the input file from which to read data.
This must be a text file, with data in a normalized format.
The first line of the file must contain column names.
-o
The name of the output file to create.
The output file will be created as a .csv file.
-r [column_name2 [...]]
One or more column names to use as row headers.
Unique values of these columns will appear at the beginning of every
output line.
-c [column_name2 [...]]
One or more column names to use as column headers in the output.
A crosstab column (or columns) will be created for every unique
combination of values of these fields in the input.
-v [column_name2 [...]]
One or more column names with values to be used to fill the cells
of the cross-table. If n columns names are specified, then there will
be n columns in the output table for each of the column headers
corresponding to values of the -c argument. The column names specified
with the -v argument will be appended to the output column headers
created from values of the -c argument. There should be only one value
of the -v column(s) for each combination of the -r and -c columns;
if there is more than one, a warning will be printed and only the first
value will appear in the output. (That is, values are not combined in
any way when there are multiple values for each output cell.)
Optional Arguments:
-d[1|2|3|4]
Controls the format of column headers. The four alternatives are:
-d1 or no option specified
One row of column headers, with elements joined by underscores
to facilitate parsing by other programs.
-d or -d2
Two rows of column headers. The first row contains values of the
columns specified by the -c argument, and the second row contains
the column names specified by the -v argument.
-d3
One header row for each of the values of the columns specified by
the -c argument, plus one row with the column names specified by
the -v argument.
-d4
Like -d3, but the values of the columns specified by the -c
argument are labeled with (preceded by) the column names.
-f
Use a temporary (sqlite) file instead of memory for intermediate
storage.
-k
Keep (i.e., do not delete) the sqlite file. Only useful with the
"-f" option. Unless the "-t" option is also used, the table name will
be "src".
-n
Use the specified default string in the output wherever an empty or
null value would otherwise appear.
-p
The character to use to join column names for d1 header output.
The default is an underscore.
-s Sort columns and rows in ascending alphabetic order.
-t
Name to use for the table in the intermediate sqlite database. Only
useful with the "-f" and "-k" options.
-e [filename]
Log all error messages, to a file if the filename is specified or to the
console if the filename is not specified.
-q
Log the sequence of SQL commands used to extract data from the input
file to write the output file, including the result of each command.
-h
Print this help and exit.
Notes:
1. Column names should be specified in the same case as they appear in the
input file.
2. The -f option creates a temporary file in the same directory as the
output file. This file has the same name as the input file, but an
extension of '.sqlite'.
3. There are no inherent limits to the number of rows or columns in the
input or output files.
4. Missing required arguments will result in an exception rather than an
error message, whatever the error logging option. If no error logging
option is specified, then if there are multiple values per cell (the
most likely data error), a single message will be printed on the console.
"""
# enum for header row codes
hdrs_1, hdrs_2, hdrs_many, hdrs_labeled = (1, 2, 3, 4)
def xtab(infilename, rownames, xtab_colnames, xtab_datanames, outfilename,
header_rows=hdrs_1, file_db=False, keep_file_db=False, tablename="src",
error_reporter=None, sql_reporter=None, nullfill=None, sort_alpha=False, hdr1_sep='_'):
"""Cross-tab data in the specified input file and write it to the output file.
Arguments:
infilename: string of the input file name. Some diagnosis of file format
(CSV or tab formatted) will be performed.
rownames: list of strings of column names in the input file that will be used
as row headers in the output file.
xtab_colnames: list of strings of column names in the input file that will be
used as primary column headers in the output file.
xtab_datanames: list of strings of column names in the input file that will be
crosstabbed in the output file. These column names will also be used as
secondary column names in the output file.
outfilename: string of the output file name. This file will all be written as CSV.
dualheader: boolean controlling whether or not there will be one or two header
rows in the output file. If a single header row is used, then the primary and
secondary column headers will be joined in each column header. If two column
headers are used, then the primary column headers will be used on the first
line of headers, and the secondary column headers will be used on the second
line of headers.
file_db: boolean controlling whether or not the sqlite db is created as a disk file
(if True) or in memory (if False, the default).
keep_file_db: boolean controlling whether or not a sqlite disk file is retained
(if True) or deleted after it has been used.
error_reporter: logging.Logger object to report nonfatal errors (specifically, the presence
of more than one value for a cell).
sql_reporter: logging.Logger object to report the sqlite queries executed and their
results.
nullfill: the value with which to replace null (empty) values in the output.
Return value
The number of warnings or errors encountere.
When multiple column headers in the input file are used as a single column header in
the output file, the column names are joined with an underscore. This is to facilitate
any subsequent parsing to be done by other programs (e.g., R).
"""
multiple_vals = False # A flag indicating whether or not multiple values were found for a single crosstab cell
if sys.version_info < (3,0,0):
outfile = open(outfilename, "wb") # The Py2 csv module adds an extra if "wt" is specified
else:
outfile = open(outfilename, "w", newline='')
csvout = csv.writer(outfile)
reportable_errors = 0
# Move the data into sqlite for easy random access.
if file_db:
inhead, intail = os.path.split(infilename)
sqldbname = os.path.join(inhead, os.path.splitext(intail)[0] + ".sqlite")
try:
os.unlink(sqldbname)
except:
pass
else:
sqldbname = None
if tablename == None:
tablename = "src"
sqldb = copy_to_sqlite(infilename, sqldbname, tablename)
# Get list of unique values for 'xtab_colnames' columns
if sort_alpha:
sqlcmd = "select distinct %s from %s order by %s;" % (",".join(xtab_colnames), tablename, ",".join(xtab_colnames))
else:
sqlcmd = "select distinct %s from %s;" % (",".join(xtab_colnames), tablename)
xtab_vals = sqldb.execute(sqlcmd).fetchall()
# Write output headers.
if header_rows == hdrs_1:
# One header row
outstrings = [n for n in rownames]
for n in xtab_vals:
for i in xtab_datanames:
outstrings.append("%s_%s" % (hdr1_sep.join(n), i.replace("'", "''")))
csvout.writerow(outstrings)
elif header_rows == hdrs_2:
# Two header rows
extra_cols = len(xtab_datanames) - 1
# Write header row 1/2
outstrings = ['' for n in rownames]
for n in xtab_vals:
hdr = " ".join(n)
outstrings.append(hdr.replace("'", "''"))
for i in range(extra_cols):
outstrings.append('')
csvout.writerow(outstrings)
# Write header row 2/2
outstrings = [n for n in rownames]
for n in xtab_vals:
for i in xtab_datanames:
outstrings.append(i.replace("'", "''"))
csvout.writerow(outstrings)
elif header_rows == hdrs_many:
# One header row for each item in xtab_vals plus a row
# for xtab_datanames.
extra_cols = len(xtab_datanames) - 1
for i in range(len(xtab_colnames)):
outstrings = ['' for n in rownames]
for n in xtab_vals:
outstrings.append(n[i].replace("'", "''"))
for x in range(extra_cols):
outstrings.append('')
csvout.writerow(outstrings)
# Write last header row of xtab_datanames
outstrings = [n for n in rownames]
for n in xtab_vals:
for i in xtab_datanames:
outstrings.append(i.replace("'", "''"))
csvout.writerow(outstrings)
else: # header_rows == hdrs_labeled
extra_cols = len(xtab_datanames) - 1
for i in range(len(xtab_colnames)):
outstrings = ['' for n in rownames]
for n in xtab_vals:
outstrings.append("%s: %s" % (xtab_colnames[i], n[i].replace("'", "''")))
for x in range(extra_cols):
outstrings.append('')
csvout.writerow(outstrings)
# Write last header row of xtab_datanames
outstrings = [n for n in rownames]
for n in xtab_vals:
for i in xtab_datanames:
outstrings.append(i.replace("'", "''"))
csvout.writerow(outstrings)
# Write output data
# For each unique combination of row headers
# Initiate a new output line
# Get the row headers
# For every item in the xtab_vals
# Select the 'xtab_datanames' columns from sqlite for the row headers and xtab_vals
# Append the first result (set warning if >1) to the output line
# Write the output line
#
# Get a list of unique combinations of row headers
if sort_alpha:
sqlcmd = "SELECT DISTINCT %s FROM %s ORDER BY %s;" % (",".join(rownames), tablename, ",".join(rownames))
else:
sqlcmd = "SELECT DISTINCT %s FROM %s;" % (",".join(rownames), tablename)
row_hdr_vals = sqldb.execute(sqlcmd).fetchall()
row_counter = 0
for l in row_hdr_vals:
row_counter = row_counter + 1
col_counter = 0
outstrings = []
# Add the row headers to the list of outstrings
for rn in range(len(l)):
outstrings.append(l[rn].replace("'", "''"))
# Make a list of WHERE conditions for the row header variables
sqlcond = ["%s='%s'" % (rownames[i], l[i].replace("'", "''")) for i in range(len(rownames))]
for n in xtab_vals:
col_counter = col_counter + 1
# Add the WHERE conditions for the crosstab values
selcond = copy.deepcopy(sqlcond)
for cn in range(len(xtab_colnames)):
selcond.append("%s='%s'" % (xtab_colnames[cn], n[cn].replace("'", "''")))
# Create and execute the SQL to get the data values
sqlcmd = "SELECT %s FROM %s WHERE %s" % (",".join(xtab_datanames), tablename, " AND ".join(selcond))
if sql_reporter:
sql_reporter.log(logging.INFO, "%s" % sqlcmd)
data_vals = sqldb.execute(sqlcmd).fetchall()
if sql_reporter:
for r in data_vals:
sql_reporter.log(logging.INFO, "\t%s" % "\t".join(r))
if len(data_vals) > 1:
multiple_vals = True
reportable_errors += 1
if error_reporter:
error_reporter.log(logging.WARNING, "Multiple result rows for the command '%s'--only the first is used." % (sqlcmd))
if len(data_vals) == 0:
if nullfill:
for n in range(len(xtab_datanames)):
outstrings.append(nullfill)
else:
for n in range(len(xtab_datanames)):
outstrings.append('')
else:
data = data_vals[0]
for n in range(len(xtab_datanames)):
outstrings.append(data[n])
csvout.writerow(outstrings)
sqldb.close()
if file_db and not keep_file_db:
try:
os.unlink(sqldbname)
except:
pass
outfile.close()
if multiple_vals and not error_reporter:
msg = "Warning: multiple data values found for at least one crosstab cell; only the first is displayed."
print(msg)
if error_reporter:
error_reporter(msg)
return reportable_errors
def unquote(str):
"""Remove quotes surrounding a string."""
if len(str) < 2:
return str
c1 = str[0]
c2 = str[-1:]
if c1==c2 and (c1=='"' or c1=="'"):
return str[1:-1].replace("%s%s" % (c1, c1), c1)
return str
def quote_str(str):
"""Add single quotes around a string."""
if len(str) == 0:
return "''"
if len(str) == 1:
if str == "'":
return "''''"
else:
return "'%s'" % str
if str[0] != "'" or str[-1:] != "'":
return "'%s'" % str.replace("'", "''")
return str
def quote_list(l):
"""Add single quotes around all strings in the list."""
return [quote_str(x) for x in l]
def quote_list_as_str(l):
"""Convert a list of strings to a single string of comma-delimited, quoted tokens."""
return ",".join(quote_list(l))
def del_file(fn):
"""Deletes the specified file if it exists."""
if os.path.isfile(fn):
os.unlink(fn)
def copy_to_sqlite(data_fn, sqlite_fn=None, tablename="src"):
"""Copies data from a CSV file to a sqlite table.
Arguments:
data_fn: a string of the data file name with the data to be read.
sqlite_fn: a string of the name of the sqlite file to create, or None if
sqlite is to use memory instead.
tablename: the name of the sqlite table to create
Value:
The sqlite connection object.
"""
dialect = csv.Sniffer().sniff(open(data_fn, "rt").readline())
inf = csv.reader(open(data_fn, "rt"), dialect)
column_names = next(inf)
if sqlite_fn == None:
conn = sqlite3.connect(":memory:")
else:
try:
os.unlink(sqlite_fn)
except:
pass
conn = sqlite3.connect(sqlite_fn)
if tablename == None:
tablename = "src"
colstr = ",".join(column_names)
try:
conn.execute("drop table %s;" % tablename)
except:
pass
conn.execute("create table %s (%s);" % (tablename, colstr))
for l in inf:
sql = "insert into %s values (%s);" % (tablename, quote_list_as_str(l))
conn.execute(sql)
conn.commit()
return conn
def print_help():
"""Print a program description and brief usage instructions to the console."""
print("xtab %s %s -- Cross-tabulates data." % (_version, _vdate))
print(__help_msg)
def get_opts(arglist):
"""Returns a dictionary of command-line arguments. This custom 'getopt' routine is used
to allow multiple column names for the -r, -c, and -v arguments with only one use of each
flag.
"""
argdict = {}
nargs = len(arglist)
argno = 1
currarg = None
currargitems = []
while argno < nargs:
arg = arglist[argno]
if len(arg) > 0:
if arg[0] == '-':
if currarg:
argdict[currarg] = currargitems
currarg = arg
currargitems = []
else:
if currarg:
currargitems.append(arg)
else:
argdict[arg] = []
argno += 1
if currarg:
argdict[currarg] = currargitems
return argdict
def main():
"""Read and interpret the command-line arguments and options, and carry out
the appropriate actions."""
args = get_opts(sys.argv)
if len(args) == 0 or '-h' in args or '--help' in args:
print_help()
sys.exit(0)
badopts = [ o for o in args.keys() if o not in ['-i', '-o', '-r',
'-c', '-v', '-d', '-d1', '-d2', '-d3', '-d4', '-f', '-k', '-s', '-t',
'-n', '-e', '-q', '-p'] ]
if len(badopts) > 0:
raise ValueError("Unrecognized option(s): %s" % ", ".join(badopts))
if '-i' in args:
if len(args['-i']) == 0:
raise ValueError(_errmsg_noinfile)
infilename = args['-i'][0]
if not os.path.exists(infilename):
raise ValueError("%s (%s)" % (_errmsg_badinfile, infilename))
else:
raise ValueError(_errmsg_noinfile)
#
if '-o' in args:
if len(args['-o']) == 0:
raise ValueError(_errmsg_nooutfile)
outfilename = args['-o'][0]
else:
raise ValueError(_errmsg_nooutfile)
#
if '-r' in args:
if len(args['-r']) == 0:
raise ValueError(_errmsg_norowheaders)
rowheaders = args['-r']
else:
raise ValueError(_errmsg_norowheaders)
#
if '-c' in args:
if len(args['-c']) == 0:
raise ValueError(_errmsg_nocolumheaders)
columnheaders = args['-c']
else:
raise ValueError(_errmsg_nocolumheaders)
#
if '-v' in args:
if len(args['-v']) == 0:
raise ValueError(_errmsg_nocellcolumns)
cellvalues = args['-v']
else:
raise ValueError(_errmsg_nocellcolumns)
#
if '-p' in args:
if len(args['-p']) == 0:
raise ValueError(_errmsg_nohdrsep)
hdr1_sep = args['-p']
hdr1_sep = ''.join(hdr1_sep)
if len(hdr1_sep) > 1:
raise ValueError(_errmsg_hdrseplen)
else:
hdr1_sep = '_'
#
hdr_opt = hdrs_1
if '-d' in args or '-d2' in args:
hdr_opt = hdrs_2
if '-d3' in args:
hdr_opt = hdrs_many
if '-d4' in args:
hdr_opt = hdrs_labeled
file_db = '-f' in args
keep_file_db = '-k' in args
tablename = 'src'
if '-t' in args:
if len(args['-t']) == 1:
tablename = args['-t'][0]
nullfill = None
if '-n' in args:
if len(args['-n']) == 1:
nullfill = args['-n'][0]
sort_alpha = '-s' in args
#
# Set up logging
#logging.basicConfig(level=logging.INFO, filemode="w", filename='')
err_logger = None
sql_logger = None
error_file = None
if '-e' in args:
err_logger = logging.getLogger("err")
err_logger.setLevel(logging.WARNING)
if len(args['-e']) == 0:
err_logger.addHandler(logging.StreamHandler())
else:
if len(args['-e']) > 1:
raise ValueError(_errmsg_baderrlogfile)
error_file = args['-e'][0]
del_file(error_file)
file_logger = logging.FileHandler(error_file, "w")
err_logger.addHandler(file_logger)
if '-q' in args:
if len(args['-q']) != 1:
raise ValueError(_errmsg_badsqllogfile)
sql_logger = logging.getLogger("sql")
sql_logger.setLevel(logging.INFO)
sql_logger.addHandler(logging.FileHandler(args['-q'][0], "w"))
#
errors = xtab(infilename, rowheaders, columnheaders, cellvalues, outfilename, hdr_opt,
file_db, keep_file_db, tablename, err_logger, sql_logger, nullfill, sort_alpha, hdr1_sep)
if errors == 0 and error_file is not None:
# Logger can create the file if a message below the warning level
# is issued, even though it will not be logged.
file_logger.close()
del_file(error_file)
if __name__=='__main__':
try:
main()
except SystemExit as x:
sys.exit(x)
except ValueError as e:
sys.stderr.write("%s\n" % str(e))
sys.exit(1)
except Exception:
strace = traceback.extract_tb(sys.exc_info()[2])[-1:]
lno = strace[0][1]
src = strace[0][3]
sys.stderr.write("%s: Uncaught exception %s (%s) on line %s (%s)." % (os.path.basename(sys.argv[0]), str(sys.exc_info()[0]), sys.exc_info()[1], lno, src))
sys.exit(1)