Skip to content

Instantly share code, notes, and snippets.

@eggplants
Forked from sneakers-the-rat/clean_pdf.sh
Last active February 21, 2023 23:08
Show Gist options
  • Save eggplants/df608e8ced4e4d993bfe3031f34fcdea to your computer and use it in GitHub Desktop.
Save eggplants/df608e8ced4e4d993bfe3031f34fcdea to your computer and use it in GitHub Desktop.
Strip PDF Metadata (improved version)
#!/usr/bin/env bash
# Strict mode: stop on command failure, on use of an unset variable,
# and when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# ANSI colour sequences so that warnings/errors stick out. They are stored
# as literal backslash text and only become escapes when printed.
RED='\033[31m'
GREEN='\033[32m'
CLEAR='\033[0m'

# One-line synopsis, shown on bad invocation and at the top of the help text.
USAGE="usage: ${0} [<TARGET DIR>=.]"
#######################################
# Print the full help text: the usage synopsis followed by a commented
# description of what the script does and two example invocations.
# Globals:   USAGE (read)
# Arguments: none
# Outputs:   help text on stdout
#######################################
_help() {
  local self="$0"
  local rule='# --------------------------------------------------------------------'

  # One argument per output line; only the lines that mention the script
  # name or a literal dollar sign need double quotes.
  printf '%s\n' \
    "${USAGE}" \
    "${rule}" \
    '# Recursively find pdfs from the directory given as the first argument,' \
    '# otherwise search the current directory.' \
    "# Use exiftool and qpdf (both must be installed and locatable on \$PATH)" \
    '# to strip all top-level metadata from PDFs.' \
    '#' \
    '# Note - This only removes file-level metadata, not any metadata' \
    '# in embedded images, etc.' \
    '#' \
    '# Code is provided as-is, I take no responsibility for its use,' \
    '# and I make no guarantee that this code works' \
    '# or makes your PDFs "safe," whatever that means to you.' \
    '#' \
    '# You may need to enable execution of this script before using,' \
    "# eg. chmod +x ${self}" \
    '#' \
    '# example:' \
    '# clean current directory:' \
    "# \$ ${self}" \
    '#' \
    '# clean specific directory:' \
    "# \$ ${self} some/other/directory" \
    "${rule}"
}
#######################################
# Write a metadata-stripped copy of one PDF next to the original.
# The copy is named like the input with its extension replaced by
# "_clean.pdf"; the input file itself is not modified.
# Globals:   GREEN, CLEAR (read)
# Arguments: $1 - path of the PDF to clean
# Outputs:   one progress line on stdout; diagnostics on stderr
# Returns:   0 when the clean copy was written (even if qpdf could not
#            linearize it), 1 on a wrong argument count or when exiftool fails
#######################################
_clean_info() {
  local tmp target
  local qpdf_rc=0

  if [ $# -ne 1 ]; then
    return 1
  fi
  target="$1"

  # output file as original filename with suffix _clean.pdf
  tmp="${target%.*}_clean.pdf"

  # Start from a fresh output file: a leftover from a previous run would
  # otherwise get in the way of exiftool's -o.
  rm -f -- "$tmp"

  # Fail explicitly instead of relying on the caller running under `set -e`.
  if ! exiftool -q -all:all= "$target" -o "$tmp"; then
    echo "err: exiftool failed on '${target}'" >&2
    return 1
  fi

  # Linearizing stays best-effort, but a failure is no longer silent:
  # without the qpdf rewrite the metadata exiftool removed may still be
  # recoverable from the file. qpdf's exit status 3 means "succeeded with
  # warnings", so it is not reported as a failure.
  qpdf --linearize --replace-input "$tmp" || qpdf_rc=$?
  if [ "$qpdf_rc" -ne 0 ] && [ "$qpdf_rc" -ne 3 ]; then
    echo "warn: qpdf failed on '${tmp}' (exit ${qpdf_rc});" \
      "stripped metadata may still be recoverable" >&2
  fi

  # %b expands the colour escapes; %s keeps backslashes in the file name literal.
  printf '=> %bProcessed%b: %b%s%b\n' \
    "$GREEN" "$CLEAR" "$GREEN" "$tmp" "$CLEAR"
}
#######################################
# Entry point: parse the arguments, verify the required tools, then clean
# every *.pdf found (recursively) below the target directory.
# Files written by an earlier run (*_clean.pdf) also match *.pdf and are
# therefore processed again like any other PDF.
# Globals:   USAGE, RED, CLEAR (read)
# Arguments: $1 - optional target directory (default "."), or -h / -help /
#                 --help to print the help text
# Outputs:   progress on stdout; usage and errors on stderr
# Returns:   0 on success or after printing help; 1 on too many arguments,
#            a missing tool or a missing target directory; otherwise the
#            status of the last _clean_info call
#######################################
main() {
  local dir tool f total i
  local -a pdfs=()

  if [ $# -gt 1 ]; then
    echo "$USAGE" >&2
    return 1
  fi
  # ${1-} keeps this safe under `set -u` even when called without arguments.
  if [[ "${1-}" =~ ^(-h|-help|--help)$ ]]; then
    _help
    return 0
  fi

  # Check each tool on its own: `command -v a b` reports success as soon as
  # any one of the names resolves, which would hide a missing tool.
  for tool in exiftool qpdf; do
    if ! command -v "$tool" > /dev/null 2>&1; then
      echo 'err: Please install: exiftool, qpdf' >&2
      return 1
    fi
  done

  # use the first argument, or '.' (this directory) if not given / empty
  dir="${1:-.}"
  if ! [ -d "$dir" ]; then
    echo "err: Target dir '${dir}' is not found." >&2
    return 1
  fi
  # Keep find from reading a directory named like "-foo" as an option.
  if [[ "$dir" == -* ]]; then
    dir="./${dir}"
  fi
  echo "Cleaning PDFs in directory $dir"

  # Collect the paths NUL-delimited so names containing spaces, newlines or
  # leading/trailing blanks survive intact; no temporary file is needed.
  # Note -- this will find pdfs recursively!!
  while IFS= read -r -d '' f; do
    pdfs+=("$f")
  done < <(find "$dir" -type f -name '*.pdf' -print0)

  total=${#pdfs[@]}
  for ((i = 0; i < total; i++)); do
    f="${pdfs[i]}"
    # %b expands the colour escapes; %s prints the file name verbatim.
    printf '[%d/%d]: %b%s%b\n' "$((i + 1))" "$total" "$RED" "$f" "$CLEAR"
    _clean_info "$f"
  done
}
# Run the entry point and exit with its status. "${@-}" expands to a single
# empty argument when the script is started without any, so main always
# receives a first parameter.
main "${@-}"
exit "$?"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment