pdfformfill/identify.py

437 lines
15 KiB
Python
Raw Normal View History

2018-02-07 16:27:15 +01:00
#! /usr/bin/python3
"""Help decipher the field names in pdf files.
A script that goes through the list of pdf form fields and asks the
user for suitable descriptions.
The information stored for each field is:
name
long description
standard value
special (hardcoded) treatment?, maybe a hint that tells the code how to use it
2018-02-07 16:27:15 +01:00
"""
# todo: the button states are assumed to be Off/On; this should be
# configurable or adapted automatically
2018-02-15 20:41:35 +01:00
# todo: disable focus on pdf - apparently not simple :(
2018-02-07 16:27:15 +01:00
import argparse
import subprocess as cmd
import os.path
import fdfgen
class FormField():
    """Store all information about one pdf form field.

    The individual pieces of information are accessed like dictionary
    entries (``field[key]``, via ``__getitem__``).  Those are:
        information from dump_data_fields (keys starting with "Field")
        informative name (["Name"])
        long description (["Description"])
        standard value (["Stdvalue"])  todo: how to implement that?
        something about a special treatment (["Special"])

    Constants:
        LISTINFOS: those infos can appear several times, hence a list
            is stored for them.
    """

    LISTINFOS = ["FieldStateOption"]

    # Answers (lower case) that are understood as "checked"/"unchecked"
    # for a Button field.
    _TRUE_WORDS = ("yes", "on", "x", "true")
    _FALSE_WORDS = ("no", "_", "off", "false")

    # Numeric character references and the characters they stand for.
    _ENTITIES = (("&#196;", "Ä"),
                 ("&#214;", "Ö"),
                 ("&#220;", "Ü"),
                 ("&#223;", "ß"),
                 ("&#228;", "ä"),
                 ("&#246;", "ö"),
                 ("&#252;", "ü"))

    def convertToBool(self, possBool):
        """Convert value to True or False if this field is a Button.

        Returns:
            True or False if this field has FieldType "Button" and
            possBool is one of the known yes/no words (case-insensitive).
            Otherwise possBool is returned unchanged: the field is no
            Button, has no "FieldType", possBool is not a string or it
            is unclear whether True or False is meant.
        """
        if "FieldType" not in self or self["FieldType"] != "Button":
            return possBool
        try:
            lowered = possBool.lower()
        except AttributeError:
            # not a string (e.g. already a bool)
            return possBool
        if lowered in self._TRUE_WORDS:
            return True
        if lowered in self._FALSE_WORDS:
            return False
        return possBool

    @staticmethod
    def convertUmlauts(value):
        """Convert the numeric html entities of umlauts to the real symbols.

        Necessary to make fdfgen generate files that are read correctly
        by pdftk.

        Returns:
            value, with &#number; of the German umlauts and sharp s
            replaced by the corresponding unicode character
        """
        for entity, uni in FormField._ENTITIES:
            value = value.replace(entity, uni)
        return value

    def __init__(self):
        """Initialise a FormField with an empty information dictionary."""
        self.__pdfinfo = {}

    def __getitem__(self, key):
        """Get an information about this field.

        Returns:
            self[key]
        Raises:
            KeyError: as usual if key is not a valid key
        """
        value = self.__pdfinfo[key]
        # __setitem__ never stores None, so this is an internal invariant
        assert value is not None
        return value

    def __setitem__(self, key, value):
        """Set an information about this field.

        Do: self[key] = value.
        If value is None, the key is removed (a missing key is ignored).
        """
        if value is None:
            # todo: add warning in some log file if key did not exist
            self.__pdfinfo.pop(key, None)
        else:
            self.__pdfinfo[key] = value

    def __delitem__(self, key):
        """Delete an information from this field.

        Raises:
            KeyError: as usual for dicts
        """
        del self.__pdfinfo[key]

    def __contains__(self, item):
        """Return if item is a valid info."""
        return item in self.__pdfinfo

    def expanditer(self):
        """Return an iterator that iterates over all information.

        For each information a tuple (name, value) is yielded.
        Information that store a list (or another non-string iterable)
        are yielded once for each element.
        """
        for info, value in self.__pdfinfo.items():
            if isinstance(value, str):
                # strings are iterable as well but must stay in one piece
                yield (info, value)
                continue
            try:
                elements = iter(value)
            except TypeError:
                # e.g. bool is not iterable
                yield (info, value)
            else:
                for oneinfo in elements:
                    yield (info, oneinfo)

    def __iter__(self):
        """Return all (info, value) pairs.

        Returns:
            iter(self.__pdfinfo.items()).
        """
        return iter(self.__pdfinfo.items())

    def __len__(self):
        """Return number of information(s)."""
        return len(self.__pdfinfo)

    def essentiallyempty(self):
        """Return if no information from the user is stored.

        Returns:
            True if every info name starts with "Field" (that is what
            dump_data_fields delivers), also if nothing is stored at all.
            False as soon as one info has another name.
        """
        return all(info.startswith("Field") for info in self.__pdfinfo)

    def _show_field(self, origpdf, fieldname):
        """Create and display a pdf in which only this field is filled.

        Writes field-tmp.fdf and <basename of origpdf>-tmpfilled.pdf into
        the current directory, then starts evince on the filled pdf.

        Raises:
            subprocess.TimeoutExpired: if pdftk needs more than 5 seconds
        """
        fdf = fdfgen.forge_fdf("", [(fieldname, "On")], [], [], [])
        with open("field-tmp.fdf", "wb") as fdf_file:
            fdf_file.write(fdf)
        tmppdf = os.path.basename(origpdf) + "-tmpfilled.pdf"
        # those pdfs are pseudo-encrypted. That makes pdftk issue
        # a warning that one should respect copyright. The messages are
        # discarded instead of piped: a pipe nobody reads can block pdftk.
        pdfcreation = cmd.Popen(["pdftk", origpdf,
                                 "fill_form", "field-tmp.fdf", "output",
                                 tmppdf], stderr=cmd.DEVNULL)
        try:
            # wait a maximum of 5 seconds for pdftk to finish,
            # then evince can work
            pdfcreation.wait(timeout=5)
        except cmd.TimeoutExpired:
            # do not leave a hanging pdftk behind
            pdfcreation.kill()
            pdfcreation.wait()
            raise
        # do not need the errors of evince to show
        cmd.Popen(["evince", tmppdf], stderr=cmd.DEVNULL)

    def _print_button_hint(self):
        """Explain the accepted answers if this field is a Button."""
        if "FieldType" in self and self["FieldType"] == "Button":
            print("It is a button.",
                  "'X', 'On', 'Yes' all say 'cross it'",
                  " while '_', 'Off', 'No' all say",
                  "'do not make a cross here'")

    def askUser(self, origpdf, update=False):
        """Ask the user for information about this form field.

        Create a pdf where only this field is filled with some data
        such that the user can easily identify which field we are
        talking about.
        Doing this, the files <basename of origpdf>-tmpfilled.pdf and
        field-tmp.fdf are created in the current directory.
        It is not checked if this is a problem.

        Attributes:
            origpdf (str): path to the pdf file that all is about
            update (bool): if fields that already have user information
                should be queried as well
        Returns:
            False if the user wants to abort (hit Ctrl+D) and considers
            her/himself done, True if the next field should be handled.
        Raises:
            ValueError: if this field has no "FieldName"
            subprocess.TimeoutExpired: if pdftk does not finish in time
        """
        if not (update or self.essentiallyempty()):
            # user data is there and should not be updated
            return True
        try:
            fieldname = self["FieldName"]
        except KeyError as err:
            raise ValueError("This field does not have a FieldName. This" +
                             " should not be possible in a pdf.") from err
        self._show_field(origpdf, fieldname)
        print("The current field is called " + fieldname)

        def unchanged(answer):
            """Take the answer of the user as it is."""
            return answer

        def nohint():
            """Print no additional hint."""

        for message, info, specialmessage, converting in [
                ("Descriptive name", "Name", nohint, unchanged),
                ("Long description", "Description", nohint, unchanged),
                ("Standard value", "Stdvalue", self._print_button_hint,
                 self.convertToBool),
                ("Special handling", "Special", nohint, unchanged)]:
            if info in self:
                print("Value now:", self[info])
                print("Enter nothing and this value is used.")
            specialmessage()
            try:
                newvalue = converting(input(message + ":"))
            except EOFError:
                # user hit Ctrl+D
                return False
            if isinstance(newvalue, str):
                if newvalue.strip() != "":
                    self[info] = newvalue
                # otherwise take old value
            elif newvalue is not None:
                # converted value (e.g. a bool)
                self[info] = newvalue
        return True

    def __str__(self):
        """Return a several line long string with all info of this field.

        One line "name: value" per entry of expanditer().
        Information that are missing are not displayed.
        """
        return "\n".join(str(info) + ": " + str(value)
                         for info, value in self.expanditer())

    def merge(self, otherField):
        """Merge two fields.

        Add the information from otherField that are not in self to self.
        All values are compared before anything is stored, hence self is
        unchanged if a ValueError is raised.

        Returns:
            self after adding stuff
        Raises:
            ValueError: if otherField and self disagree on a value.
        """
        for info, value in otherField:
            # otherField.__iter__() is called
            if info in self and self[info] != value:
                raise ValueError("The two fields disagree on " + info
                                 + ": '" + str(self[info]) + "' != '"
                                 + str(value) + "'")
        for info, value in otherField:
            # info does not exist in self or is equal
            self[info] = value
        return self
2018-02-07 16:27:15 +01:00
def parse():
    """Parse the command line arguments.

    Results:
        dictionary with
        input pdf file (["pdffile"])
        output info file (["output"]), "form.form" if not given
        whether fields with stored information are asked again (["update"])
    """
    parser = argparse.ArgumentParser(
        description="Go through the list of fields of a pdf form" +
        " and ask for each one for a helpful description.")
    parser.add_argument("pdffile",
                        help="The pdf file with the form fields.")
    # nargs="?" is needed, otherwise the positional argument stays
    # mandatory and the default is never used
    parser.add_argument("output", nargs="?", default="form.form",
                        help="The file that includes all the information " +
                        "collected in this script.")
    parser.add_argument("-u", "--update", help="Also ask for information " +
                        "about fields that already have information stored.",
                        action="store_true")
    return vars(parser.parse_args())
def listFields(pdf_file, fieldListFile):  # NOQA
    """Use pdftk dump_data_fields to generate a list of all data fields.

    The output of pdftk and the content of fieldListFile (if it exists)
    are parsed as one list of "---"-separated records. Records with the
    same FieldName are merged into one field.

    Attribute:
        pdf_file (str): pdf file with the form
        fieldListFile (str): file with same format with some data already
            entered; it is ignored if it does not exist

    Results:
        list of all form fields. Each field is a FormField.
        Typically the information from pdftk are:
            FieldType
            FieldName
            FieldNameAlt
            FieldFlags
            FieldJustification
            FieldStateOption (list of several options)

    Raises
    ------
        ValueError: if the input does not start with ---, a line cannot
            be parsed, a record is empty, an information appears twice
            in one record or merged records disagree on a value
        KeyError: if a record has no FieldName
        some io error if pdftk has problems
    """
    dump = cmd.Popen(["pdftk", pdf_file, "dump_data_fields"], stdout=cmd.PIPE,
                     universal_newlines=True)
    # second result of communicate would be the errors
    field_desc = dump.communicate()[0].splitlines()
    # includes both inputs: pdftk and file
    try:
        with open(fieldListFile) as data:
            field_desc += data.read().splitlines()
    except FileNotFoundError:
        pass  # nothing stored yet

    if not field_desc or field_desc[0] != "---":
        raise ValueError("Output of pdftk dump_data_fields should start " +
                         "with ---.")
    field_desc = [l for l in field_desc if l.strip() != ""]
    # makes sure that the last record is finished as well
    field_desc.append("---")

    fields = []
    field = FormField()
    for line in field_desc[1:]:  # ignore first ---
        if line == "---":
            if len(field) == 0:
                raise ValueError("Several --- following each other.")
            # else add:
            for f in fields:
                try:
                    samefield = f["FieldName"] == field["FieldName"]
                except KeyError as ke:
                    raise KeyError("Some field has no 'FieldName': "
                                   + str(ke)) from ke
                if samefield:
                    f.merge(field)
                    break
            else:
                # no field for merging found
                fields.append(field)
            field = FormField()  # start new Field
            continue

        info, separator, value = line.partition(": ")
        if not separator:
            raise ValueError("The line '" + line +
                             "' cannot be parsed. " +
                             "Apparently there is no ': '.")
        value = FormField.convertUmlauts(value)
        if info in FormField.LISTINFOS:
            if info in field:  # several entries
                field[info].append(value)
            else:
                field[info] = [value]
        elif info in field:
            raise ValueError("The information " + info +
                             " appeared twice in one field.")
        else:
            field[info] = value
    return fields
if __name__ == "__main__":
    # collect the fields, ask the user about them, store the answers
    options = parse()
    pdf_path = options["pdffile"]
    info_path = options["output"]
    formfields = listFields(pdf_path, info_path)
    for formfield in formfields:
        # askUser returns False as soon as the user wants to stop
        if not formfield.askUser(pdf_path, update=options["update"]):
            break
    with open(info_path, mode="w") as outputfile:
        for formfield in formfields:
            if formfield.essentiallyempty():
                # only information from pdftk, nothing worth storing
                continue
            outputfile.write("\n---\n")
            outputfile.write(str(formfield))
2018-02-07 16:27:15 +01:00
# next todos:
# • add possibility to abort this information input
2018-02-08 11:10:40 +01:00
# • evince should display the right page somehow, maybe give the user
# the possibility to say "this was on page 2, assume next one is on page 2
# as well"
2018-02-07 16:27:15 +01:00
# • save this information in a suitable file
# • read from this file and do not ask for things that are already
# saved in this file