Created
September 28, 2015 19:52
-
-
Save raphiz/3cd332d980d6f4e4cb9c to your computer and use it in GitHub Desktop.
PDF watermark removal
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Remove a text watermark from every page of ``input.pdf``.

Reads ``input.pdf`` from the current directory, blanks every ``TJ``
(show text) operation whose first array element is a text string that
starts with ``wm_text``, and writes the result to ``output.pdf``.

NOTE(review): this relies on the legacy PyPDF2 API (``PdfFileReader``,
``PdfFileWriter``, ``PyPDF2.pdf.ContentStream``). Confirm the installed
PyPDF2 version still provides these names before running.
"""
from PyPDF2 import PdfFileReader, PdfFileWriter
from PyPDF2.pdf import ContentStream
from PyPDF2.generic import TextStringObject, NameObject
from PyPDF2.utils import b_

# Prefix that identifies the watermark text, and what to put in its place.
wm_text = 'Persönliches Exemplar von'
replace_with = ''

# The input file stays open until the output has been written: the pages
# handed to the writer still refer to the reader (and thus to this stream),
# so closing it earlier could break the final write.
with open('input.pdf', "rb") as input_stream:
    # Load PDF into pyPDF
    source = PdfFileReader(input_stream)
    output = PdfFileWriter()

    # For each page
    for page_number in range(source.getNumPages()):
        # Get the current page and its contents
        page = source.getPage(page_number)
        content_object = page["/Contents"].getObject()
        content = ContentStream(content_object, source)

        # Loop over all pdf elements
        for operands, operator in content.operations:
            # You might adapt this part depending on your PDF file
            if operator != b_("TJ"):
                continue
            # Skip a TJ with no operand or an empty array instead of
            # failing with IndexError on operands[0][0].
            if not operands or not operands[0]:
                continue
            text = operands[0][0]
            if isinstance(text, TextStringObject) and text.startswith(wm_text):
                operands[0] = TextStringObject(replace_with)

        # Set the modified content as content object on the page
        page[NameObject('/Contents')] = content
        # Add the page to the output
        output.addPage(page)

    # Write the stream; the context manager flushes and closes the file.
    with open("output.pdf", "wb") as output_stream:
        output.write(output_stream)
It does not work for me either
ContentStream had an issue and I think they have removed that function
Just use PyPDF4 instead of PyPDF2
PyPDF4 is very outdated. The up-to-date repository is https://github.com/py-pdf/pypdf
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
It doesn't work for me.