Created
May 27, 2019 21:19
-
-
Save rwirth/82bdd2925dc449666f168b0e4d175e57 to your computer and use it in GitHub Desktop.
Script to add bookmarks to a PDF file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import collections | |
import io | |
import re | |
import PyPDF2.pdf as pdf | |
# Node of the bookmark tree.  ``name`` is the bookmark title, ``page`` the
# zero-based page index inside the PDF file and ``children`` a dict mapping
# the last component of a sub-chapter number (as int) to its ChapterTreeNode.
ChapterTreeNode = collections.namedtuple('ChapterTreeNode', 'name page children')


def read_bookmarks(bmfile, offset=1, strip=False):
    """Parse a bookmark description file into a list of top-level nodes.

    Every non-blank line is split on whitespace and is one of:

    * ``x.y.z Title words PAGE`` -- a numbered chapter.  It is stored under
      its parent chapter ``x.y``, which must appear earlier in the file.
    * ``Title words PAGE`` -- an entry without chapter number; always
      top level.
    * ``>> shift N`` -- a command adding ``N`` to the page offset used for
      all following lines.

    Args:
        bmfile: Path of the UTF-8 encoded bookmark file.
        offset: Page number (1-based, within the PDF file) of the page that
            the bookmark file calls page 1.
        strip: If true, drop the chapter number from the names of numbered
            entries.  Entries without chapter number are never altered.

    Returns:
        List of the top-level ChapterTreeNode entries in file order.
        Sub-chapters are reachable through the ``children`` dicts.

    Raises:
        KeyError: A chapter's parent has not been defined (the offending
            line is printed first).
        ValueError: A page number or shift amount is missing or not an
            integer (for page numbers the offending line is printed first).
        RuntimeError: A ``>>`` line names an unknown command.
    """
    bookmarks = []
    # Invisible root that holds the numbered top-level chapters.
    chapters = ChapterTreeNode(None, None, {})

    with io.open(bmfile, encoding='utf8') as f:
        for line in f:
            fields = line.split()
            if not fields:
                continue

            # Commands carry no page number, so handle them before any
            # attempt to parse the last field as one.
            if fields[0] == '>>':
                command = fields[1] if len(fields) > 1 else ''
                if command != 'shift':
                    raise RuntimeError('unknown command {}'.format(command))
                if len(fields) < 3:
                    raise ValueError(
                        'shift command without amount: {}'.format(line.strip()))
                offset += int(fields[2])
                continue

            try:
                # Correction factor because PDF page indexing starts at zero
                # and the offset is for page 1
                page = offset + int(fields[-1]) - 2
            except ValueError:
                print('Offending line: {}'.format(line))
                raise

            # Only a first field consisting entirely of dot-separated numbers
            # is a chapter number; a title such as "3D Graphics" is not.
            if re.fullmatch(r'(\d+\.)*\d+', fields[0]) is not None:
                if strip:
                    name = ' '.join(fields[1:-1])
                else:
                    name = ' '.join(fields[:-1])
                entry = ChapterTreeNode(name, page, {})

                numbers = [int(part) for part in fields[0].split('.')]
                parent = chapters
                for num in numbers[:-1]:
                    try:
                        parent = parent.children[num]
                    except KeyError:
                        print('Offending line: {}'.format(line))
                        raise
                parent.children[numbers[-1]] = entry
                if len(numbers) == 1:
                    bookmarks.append(entry)
            else:
                name = ' '.join(fields[:-1])
                bookmarks.append(ChapterTreeNode(name, page, {}))

    return bookmarks
def add_bookmarks(writer, bookmarks):
    """Add a forest of bookmark nodes to ``writer``.

    Args:
        writer: Object providing ``addBookmark(title, page, parent=...,
            italic=...)`` and returning a handle usable as ``parent`` for
            nested entries.
        bookmarks: Iterable of root nodes, each with ``name``, ``page`` and a
            ``children`` dict.

    Nodes are added depth-first; the children of a node are visited in
    ascending order of their dict keys.  A root node that has no children is
    added in italics.
    """
    def _attach(node, parent_ref):
        # Italic marks top-level entries that have no sub-entries.
        childless_root = parent_ref is None and not node.children
        node_ref = writer.addBookmark(
            node.name, node.page, parent=parent_ref, italic=childless_root)
        for key in sorted(node.children):
            _attach(node.children[key], node_ref)

    for root in bookmarks:
        _attach(root, None)
def _main():
    """Command-line entry point: add bookmarks from a text file to a PDF.

    Parses the command line, reads the bookmark file, clones the input PDF
    into a writer, adds the bookmarks and writes the result either to the
    file given with ``-o`` or, as raw bytes, to standard output.
    """
    import argparse
    import sys

    parser = argparse.ArgumentParser(description='Add bookmarks to a PDF file.')
    parser.add_argument('-f', '--offset', type=int, default=1,
                        help='Page number of page 1 in the file.')
    parser.add_argument('-o', '--outfile',
                        help='Output file name. Writes to stdout if none given.')
    parser.add_argument('-s', '--strip', action='store_true',
                        help='Strip chapter number from entry.')
    parser.add_argument('pdf', help='Input PDF file.')
    parser.add_argument('bookmarkfile', help='Bookmark file. Lines consist of '
                        'optionally a chapter number x.y.z, followed by the title '
                        'and the page number. Fields are separated by whitespace, '
                        'all runs of whitespace are replaced by single spaces in '
                        'the bookmarks. Upper levels must precede their children. '
                        'Chapterless entries can only be at the top level. '
                        'Encoding is UTF-8.')
    args = parser.parse_args()

    bookmarks = read_bookmarks(args.bookmarkfile, args.offset, args.strip)

    pdffile = pdf.PdfFileReader(args.pdf)
    writer = pdf.PdfFileWriter()
    writer.cloneDocumentFromReader(pdffile)
    # fake write to get a proper PDF
    # NOTE(review): workaround kept from the original author; presumably the
    # cloned document is only fully set up in the writer after a first write.
    # Confirm against the PyPDF2 version in use before removing.
    writer.write(io.BytesIO())
    add_bookmarks(writer, bookmarks)

    # Serialize into memory first so that the destination is only touched
    # once the PDF has been produced successfully.
    with io.BytesIO() as output:
        writer.write(output)
        data = output.getvalue()

    if args.outfile is not None:
        with open(args.outfile, 'wb') as f:
            f.write(data)
    else:
        # sys.stdout is a text stream in Python 3 and rejects bytes; the
        # PDF has to go through its underlying binary buffer.
        sys.stdout.buffer.write(data)
        sys.stdout.buffer.flush()
# Run the command-line interface only when executed as a script, so the
# module can be imported without side effects.
if __name__ == "__main__":
    _main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment