|
#!/usr/bin/env bash |
|
# sudo ln -s "$(pwd)/csv2datasette" /usr/bin/csv2datasette |
|
|
|
# csv2datasette is meant to explore CSV data. It is not meant to create a sustainable DB. |
|
# csv2datasette is a bash script which open CSV files directly in Datasette. It offers |
|
# a number of options for reading and exploring CSV files, such as --stats, inspired by WTFCsv. |
|
# |
|
# `--stats` option includes, for each column: the column name, the number of unique values, |
|
# the number of filled rows, the number of missing values, the mininmum value, the maximum value, |
|
# the average, the sum, the shortest string, the longest string, the number of numeric values, |
|
# the number of text values, the number of probable ISO dates, the top 3 frequent values. |
|
|
|
# csv2datasette is only tested under Linux. |
|
|
|
# Its default behavior is to: |
|
# * create a DB in a random temporary directory (under /tmp), each time the script is used |
|
# * normalize database name (avoid spaces, hyphens and dots), and use it for the table name |
|
# * rename columns to avoid spaces |
|
# * launch Datasette on a random unused port, and open the browser |
|
# * delete the temporary DB at the end of the script. |
|
# |
|
# -e | --generate-column-names > option allows to automatically add column names if the CSV doesn't have some. |
|
# -s "," | --separator "," > allows to change the CSV separator (aka delimiter) |
|
# -t | > CSV separator is a tab |
|
# -d "," | --decimal-separator "," > allows to specify the decimal separator used for numbers (typically "," instead of "." in France) |
|
# --stats > add table called "stats" with stats like WTFCsv: https://databasic.io/en/wtfcsv/ |
|
# |
|
# If you need to add different or more options to datasette command line, use ~/.csv2datasetterc file. |
|
# ~/.csv2datasetterc is "bash style" so don't forget "\" at the of each line if you have multiple lines. Eg.: |
|
# --setting truncate_cells_html 512 \ |
|
# --load-extension /usr/lib/sqlite3/pcre.so |
|
|
|
# Other features: |
|
# * verify if datasette is well installed |
|
# * stats: following Bertin's advices, treat zeros (0) differently than other walues, to ease stats reading |
|
# for the following columns: filled, missing, numericValues, textValues, probISODate |
|
# We have chosen to display zeros in a very light gray |
|
# * stats: round avg and sum to 5 decimals |
|
|
|
|
|
# Inspired from https://gist.github.com/ahebrank/752fdb77876ebe86276df100bca55e79 |
|
|
|
|
|
# ### |
|
# TODO (or not). (In approximate order of importance.) |
|
# * Test https://github.com/visprex/visprex for inspiration |
|
# * [DB] verify the CSV file is not too big ("... do you want to continue?") |
|
# * [DB] allows to import CSV files from http links? |
|
# * [Service] stats: medata.yml to display information about the stats table: columns documentation, aggregated stats (average fillness...)... |
|
# * https://docs.datasette.io/en/stable/metadata.html |
|
# * Description of the CSV |
|
# This table represent data imported from ___ CSV with '|' separator and ',' as the decimal separator. |
|
# This table contains XX columns that are YY% complete. |
|
# NN columns are fully completed, ZZ almost completed, PP empty, and JJ almost empty. |
|
# You can get [stats] related to each columns. |
|
# * [Service] stats: when few values, display barchart as in https://databasic.io/en/wtfcsv/ |
|
# * [Service] stats: data quality stats: data type homogeneity (all NUMERIC?), weird values, null island, epoch time... |
|
# * [Service] save DB in its original form, without any transformation, in a dedicated table |
|
# * [Service] convert dates in ISO-8601 (well suited for sorting)? |
|
# * [Service] manage null value? |
|
# * [Service] manage monetary formats? |
|
# * [Service] detect UNIX timestamp format (eg. 1623855208) -- https://en.wikipedia.org/wiki/Unix_time |
|
# => integers between 0 and SELECT strftime('%s', 'now'); |
|
# or SELECT (julianday('now') - 2440587.5) * 86400.0; /* including the fractional part */ |
|
|
|
|
|
# ### |
|
# Init |
|
# |
|
OPTIONS="" |
|
_V=0 |
|
SEPARATOR="," |
|
|
|
# Exit if datasette is not installed |
|
[[ ! $(datasette --version) ]] && { echo "Datasette is not installed or not reachable. \ |
|
See Datasette documentation: https://datasette.io/"; \ |
|
exit 1; } |
|
|
|
# test if REGEXP module is installed |
|
regexp_installed=$(sqlite3 -line <<< "SELECT 'Hello, world' REGEXP 'world';") |
|
if [[ "$?" != "0" ]]; then |
|
echo -e "REGEXP module is not working. --stats option can't work.\n |
|
For more information, see: https://gist.github.com/CharlesNepote/3fdb8982dc873f34aa7c320bef580fe3#file-readme-md" |
|
fi |
|
|
|
usage=$(cat <<EOF |
|
Usage: csv2datasette [ -e | --generate-column-names ] |
|
[ -s | --separator "," ] |
|
[ -t ] |
|
[ --decimal-separator ","] |
|
[ --stats ] |
|
[ -v ] |
|
filename(s) |
|
|
|
csv2datasette [-h | --help] # displays this help. |
|
|
|
EOF |
|
) |
|
usage() { |
|
echo "${usage}" |
|
exit 2 |
|
} |
|
|
|
log() { [[ ${_V} -eq 1 ]] && echo -e "$*"; } |
|
|
|
# 's:' implies that option -s has value and is a mandatory option. ':' means has a value. |
|
PARSED_ARGUMENTS=$(getopt -a -n csv2datasette \ |
|
-o hes:td:v \ |
|
--long help,interactive,add-to-current-tables,generate-column-names,stats,separator:,tsv,decimal-separator: -- "$@") |
|
VALID_ARGUMENTS=$? |
|
[[ "${VALID_ARGUMENTS}" != "0" ]] && usage |
|
|
|
eval set -- "${PARSED_ARGUMENTS}" |
|
while : |
|
do |
|
case "$1" in |
|
-h | --help) usage ;; |
|
-e | --generate-column-names) COL_NAMES="generated" ; shift ;; |
|
-s | --separator) SEPARATOR="$2" ; shift 2 ;; |
|
-t | --tsv) SEPARATOR="\t" ; shift ;; |
|
-d | --decimal-separator) decimal="$2" ; shift 2 ;; |
|
--stats) STATS=1 ; shift ;; |
|
-v) _V=1 ; shift ;; |
|
# -- means the end of the arguments; drop this, and break out of the while loop |
|
--) shift; break ;; |
|
# If invalid options were passed, then getopt should have reported an error, |
|
# which we checked as VALID_ARGUMENTS when getopt was called... |
|
*) echo "Unexpected option: $1 - this should not happen." |
|
usage ;; |
|
esac |
|
done |
|
log "PARSED_ARGUMENTS is ${PARSED_ARGUMENTS}" |
|
log "Parameters remaining are: $*" |
|
|
|
# Test file |
|
[[ -f $@ ]] || { echo "$@ not existing..."; usage; exit 1; } |
|
|
|
# normalize file name to avoid creation of a table with dots (.), spaces ( ) |
|
# or hyphens (-). See csvs-to-sqlite issue: "option to lower case column names, |
|
# replace space with underscore, and remove reservered character from column names #86" |
|
# => https://github.com/simonw/csvs-to-sqlite/issues/86 |
|
# |
|
# shellcheck disable=SC2001 |
|
name=$(echo "$@" | sed 's/^\(.*\)\.\(.*\)$/\1/') # remove extension (my_data.csv => my_data) |
|
# shellcheck disable=SC2001 |
|
name=$(echo "${name}" | sed 's/[.\ -]/_/g') # replace dots (.), spaces ( ) or hyphens (-) by underscore (_) |
|
log "Table name: ${name}" |
|
# Store all files in a temporary dir: /tmp/csv2datasette.name.jkfhkhfjuygh |
|
tempdir=$(mktemp -d "${TMPDIR:-/tmp/}$(basename "$0").${name}.XXXXXXXXXXXX") |
|
log "Temp dir: ${tempdir}" |
|
database="${tempdir}/${name}.db" |
|
|
|
# ### |
|
# Column names |
|
# |
|
if [[ ${COL_NAMES} == "generated" ]]; then |
|
# Guessing the number of columns can be tricky when the CSV file use double quotes. |
|
# Commons GNU tools can be fooled, that's why at first we try to use smarter tools |
|
# if they are already installed. |
|
# |
|
# Try to guess number of columns with csvkit |
|
if [[ $(csvcut -h) ]]; then |
|
NB_COLS=$(csvcut -n "$@" | wc -l) |
|
# TODO: exit if 0 column |
|
log "csvcut have guessed the number of columns: ${NB_COLS}" |
|
header="col_1" |
|
for ((i=2; i <= NB_COLS; i++)); do header="${header},col_${i}"; done |
|
log "Adding header: ${header}\n" |
|
sed "1 i\\${header}" "$@" > "${tempdir}/${name}.tmp.csv" |
|
# Else add header thanks to awk |
|
# TODO: also try with other tools: https://www.unix.com/shell-programming-and-scripting/138223-how-find-count-commas-string-excluding-ones-double-quotes.html |
|
else |
|
# https://stackoverflow.com/questions/52044576/how-to-add-a-header-to-text-file-in-bash |
|
log "Adding header with awk...\n" |
|
awk -F "${SEPARATOR}" 'NR==1{for(i=1;i<=NF;i++) printf "col_%d%s", $i,(i==NF?ORS:FS)}1' \ |
|
"$@" > "${tempdir}/${name}.tmp.csv" |
|
fi |
|
else |
|
head -n 1 "$@" | grep -q " " |
|
if [[ $? != 0 ]]; then |
|
# If there is no space in the columns headers, just make link |
|
# shellcheck disable=SC2164 |
|
CSV_FILE_FULL_PATH="$(cd "$(dirname "$@")"; pwd -P)/$(basename "$@")" |
|
log "Link ${CSV_FILE_FULL_PATH} to ${tempdir}/${name}.tmp.csv" |
|
ln -s "${CSV_FILE_FULL_PATH}" "${tempdir}/${name}.tmp.csv" |
|
else |
|
log "rename columns to avoid spaces..." |
|
# TODO: this is very long with a big file! |
|
sed '1 s/ /_/g' "$@" > "${tempdir}/${name}.tmp.csv" |
|
fi |
|
fi |
|
|
|
# ### |
|
# Creating database... |
|
# |
|
[[ $SEPARATOR = "\t" ]] && DELIMITER="--tsv" || DELIMITER="--csv --delimiter=${SEPARATOR}" |
|
command="sqlite-utils insert ${database} ${name} ${tempdir}/${name}.tmp.csv ${OPTIONS} ${DELIMITER}" |
|
log "\n\nsqlite-utils command: ${command}" |
|
# TODO: this can be very long for big files. Eg. 50+ minutes for Open Food Facts CSV as of 2023-12-14. |
|
# Use something different than sqlite-utils? |
|
${command} |
|
[[ $? != 0 ]] && { log "Error $?. Exiting..."; rm -r "${tempdir}"; exit 1; } |
|
|
|
# ### |
|
# Improving database if necessary |
|
# |
|
# Convert decimal separator |
|
if [[ ${decimal} != "" ]]; then |
|
readarray columns < <( sqlite3 "${database}" "SELECT name FROM PRAGMA_TABLE_INFO('${name}');" ) |
|
for col in "${columns[@]}"; do col=$(echo "${col}" | sed -e 's/[[:space:]]//g') |
|
log "Column: ${col}" |
|
sqlite3 "${database}" \ |
|
"UPDATE ${name} SET [${col}] = CAST(replace([${col}], '${decimal}', '.') AS NUMERIC) WHERE [${col}] LIKE '%${decimal}%';" |
|
done |
|
fi |
|
|
|
# ### |
|
# Creating stats... |
|
# |
|
if [[ ${STATS} == 1 ]]; then |
|
log "\nBuilding stats..." |
|
# Number of entries in the table |
|
nbOfEntries=$(sqlite3 "${database}" "select count(*) from [${name}];") |
|
sqlite3 "${database}" \ |
|
"create table stats(column_name TEXT, |
|
uniq INT, |
|
filled INT, |
|
missing INT, |
|
min TEXT, |
|
max TEXT, |
|
avg INT, |
|
sum INT, |
|
shortestStr INT, |
|
longestStr INT, |
|
numericValues INT, |
|
textValues INT, |
|
probISODate INT, |
|
top_3_freq TEXT);" |
|
readarray columns < <( sqlite3 "${database}" "SELECT name FROM PRAGMA_TABLE_INFO('${name}');" ) |
|
for col in "${columns[@]}"; do col=$(echo "${col}" | sed -e 's/[[:space:]]//g') |
|
log "\nColumn: ${col}" |
|
nbOfMissingValues=$(sqlite3 "${database}" "select count([${col}]) from [${name}] where [${col}] == '';") |
|
nbOfFilledValues=$(sqlite3 "${database}" "select count([${col}]) from [${name}] where [${col}] != '';") |
|
nbOfDistinctValues=$(sqlite3 "${database}" "select count(distinct [${col}]) from [${name}];") |
|
# Don't compute anything if the column is empty |
|
if (( nbOfEntries - nbOfMissingValues == 0 )); then |
|
log "column ${col} is empty..." |
|
sqlite3 "${database}" \ |
|
"insert into stats (column_name, uniq, filled, missing) values ('${col}', ${nbOfDistinctValues}, ${nbOfFilledValues}, ${nbOfMissingValues});" |
|
continue |
|
fi |
|
|
|
# If a column is clearly text, do not compute sum and average |
|
nbOfTextValues=$(sqlite3 "${database}" "select count(*) from [${name}] where [${col}] != '' |
|
and not ( [${col}] REGEXP '^[-+]?([0-9]*[\.])?[0-9]+([eE][-+]?\d+)?$' |
|
and [${col}] not REGEXP '^[-+]?0[0-9]+' /* exclude 056 or -082 */ );") |
|
nbOfNumericValues=$(sqlite3 "${database}" "select count(*) from [${name}] where [${col}] != '' |
|
and [${col}] REGEXP '^[-+]?([0-9]*[\.])?[0-9]+([eE][-+]?\d+)?$' |
|
and [${col}] not REGEXP '^[-+]?0[0-9]+' /* exclude 056 or -082 */;") |
|
log "nbOfEntries (${nbOfEntries}) - nbOfTextValues (${nbOfTextValues}) = $(( nbOfTextValues - nbOfEntries ))" |
|
|
|
textValuesPercent=$(echo "scale=2; (${nbOfTextValues}/${nbOfFilledValues})*100"|bc) |
|
log "textValuesPercent: ${textValuesPercent}" |
|
numericValuesPercent=$(echo "scale=2; (${nbOfNumericValues}/${nbOfFilledValues})*100"|bc) |
|
log "numericValuesPercent: ${numericValuesPercent}" |
|
|
|
# TODO: isText=1 if 50% of text values? |
|
# TODO: isNumeric=1 if 90% of numeric values (test if this work for codes such as EAN) |
|
(( $(bc <<<"${textValuesPercent} > 90") )) && isText=1 || isText=0 |
|
log "isText: ${isText}" |
|
|
|
sqlite3 "${database}" \ |
|
"insert into stats (column_name, uniq, filled, missing, min, max, avg, sum, shortestStr, longestStr, numericValues, textValues, probISODate, top_3_freq) |
|
values ( |
|
'${col}', |
|
-- uniq |
|
${nbOfDistinctValues}, |
|
-- filled |
|
${nbOfFilledValues}, |
|
-- missing |
|
${nbOfMissingValues}, |
|
-- min |
|
(select min([${col}]) from [${name}]), |
|
-- max |
|
(select max([${col}]) from [${name}]), |
|
-- Compute average if it's a numeric column |
|
( $( (( isText == 1 )) && echo "''" || echo "select round(avg([${col}]) * 100000) / 100000.0 from [${name}]") ), |
|
-- Compute sum if it's a numeric column |
|
( $( (( isText == 1 )) && echo "''" || echo "select round(sum([${col}]) * 100000) / 100000.0 from [${name}]") ), |
|
-- shortestStr |
|
(select length([${col}]) as l from [${name}] order by l asc limit 1), |
|
-- longestStr |
|
(select length([${col}]) as l from [${name}] order by l desc limit 1), |
|
-- numericValues |
|
${nbOfNumericValues}, |
|
-- textValues |
|
${nbOfTextValues}, |
|
-- probISODate |
|
(select count(*) from [${name}] where [${col}] is not null |
|
and [${col}] |
|
REGEXP '^[1-9][0-9]?[0-9]?[0-9]?((-?(0[13578]|1[02])(-?(0[1-9]|[12][0-9]|3[01]))?|-?((0[469]|11)-?(0[1-9]|[12][0-9]|30))?|-?(02-?(0[1-9]|[12][0-9]))?)?((T| )(0[0-9]|1[0-9]|2[0-3]):(0[0-9]|[1-5][0-9])(:(0[0-9]|[1-5][0-9])((\.[0-9][0-9]?[0-9]?)?(Z|\+[0-2][0-9]:[0-5][0-9])?)?)?)?)?$' |
|
/**/), |
|
(select group_concat(n, char(10)) from ( |
|
select [${col}] || ' (' || count([${col}]) || ')' as n from [${name}] GROUP BY [${col}] ORDER BY COUNT([${col}]) DESC limit 3) |
|
) |
|
);" |
|
done |
|
fi |
|
|
|
|
|
# ### |
|
# Creating metadata file |
|
# |
|
metadata=$(cat <<EOF |
|
extra_css_urls: |
|
- /static/datasette2csv.css |
|
extra_js_urls: |
|
- /static/datasette2csv.js |
|
EOF |
|
) |
|
echo "${metadata}" >> "${tempdir}/metadata.yml" |
|
|
|
mkdir -v "${tempdir}/static" |
|
js=$(cat << EOF |
|
/* Add the "zero" CSS class when some stats table columns contains zero (to ease reading) */ |
|
window.addEventListener('load', function () { |
|
let classes = ["col-filled", "col-missing", "col-numericValues", "col-textValues", "col-probISODate"]; |
|
for (const c of classes) { |
|
let x = document.getElementsByClassName(c); |
|
for (let i = 0; i < x.length; i++) { |
|
console.log(i); console.log(document.getElementsByClassName(c)[i].innerText); |
|
if (document.getElementsByClassName(c)[i].innerText == 0) { x[i].classList.add('zero'); } |
|
} |
|
} |
|
}); |
|
EOF |
|
) |
|
echo "${js}" >> "${tempdir}/static/datasette2csv.js" |
|
|
|
css=$(cat << EOF |
|
.zero { color: RGBA(0,0,0,.2) !important; } |
|
EOF |
|
) |
|
echo "${css}" >> "${tempdir}/static/datasette2csv.css" |
|
|
|
#read -p "Are you sure? " -n 1 -r; echo |
|
|
|
|
|
# ### |
|
# Datasette port |
|
# |
|
# Use a port which is not already used |
|
port=$(python3 -c 'import socket; s=socket.socket(); s.bind(("", 0)); print(s.getsockname()[1]); s.close()') |
|
|
|
|
|
# ### |
|
# Launch datasette |
|
# |
|
log "\nLaunching datasette... (Press CTRL+C to quit)" |
|
# See: https://docs.datasette.io/en/stable/cli-reference.html |
|
# -h, --host TEXT Host for server. Defaults to 127.0.0.1 which means only connections from the local machine |
|
# will be allowed. Use 0.0.0.0 to listen to all IPs and allow access from other machines. |
|
# -p, --port INTEGER RANGE Port for server, defaults to 8001. Use -p 0 to automatically assign an available port. |
|
# [0<=x<=65535] |
|
# -o, --open Open Datasette in your web browser |
|
# --metadata FILENAME Path to JSON/YAML file containing license/source metadata |
|
datasette "${database}" -h 0.0.0.0 -p "${port}" --open \ |
|
--setting facet_time_limit_ms 20000 \ |
|
--setting facet_suggest_time_limit_ms 10000 \ |
|
--setting sql_time_limit_ms 20000 \ |
|
--setting max_returned_rows 100000 \ |
|
--setting truncate_cells_html 500 \ |
|
$( [[ -f "${HOME}/.csv2datasetterc" ]] && { while read p; do echo $p; done < "${HOME}/.csv2datasetterc"; } ; ) \ |
|
--metadata "${tempdir}/metadata.yml" \ |
|
--static static:${tempdir}/static/ |
|
|
|
|
|
# ### |
|
# Delete database |
|
# |
|
log "Deleting all files and directories which where created in ${tempdir}..." |
|
rm -r "${tempdir}" |
|
#trap "rm -rf ${tempdir}" EXIT |