2018-02-07 16:27:15 +01:00
|
|
|
#! /usr/bin/python3
|
|
|
|
"""Help decrypting the field names in pdf files.
|
|
|
|
|
|
|
|
A script that goes through the list of pdf form fields and asks the
|
|
|
|
user for suitable descriptions.
|
2018-02-08 11:08:38 +01:00
|
|
|
|
|
|
|
The information stored for each field is:
|
|
|
|
name
|
|
|
|
long description
|
|
|
|
standard value
|
|
|
|
special (hardcoded) treatment?, maybe code that tells the code how to use it
|
2018-02-07 16:27:15 +01:00
|
|
|
"""
|
|
|
|
|
2018-02-15 01:02:05 +01:00
|
|
|
# todo: the button variants are Off and On; this should be adjustable or
# be adjusted automatically
|
2018-02-15 20:41:35 +01:00
|
|
|
# todo: disable focus on pdf - apparently not simple :(
|
2018-02-15 01:02:05 +01:00
|
|
|
|
2018-02-07 16:27:15 +01:00
|
|
|
import argparse
|
|
|
|
import subprocess as cmd
|
2018-02-08 11:08:38 +01:00
|
|
|
import os.path
|
|
|
|
import fdfgen
|
|
|
|
|
|
|
|
|
|
|
|
class FormField():
    """Store all information for one pdf form field.

    The information is accessed like in a dictionary:
    ``field[key]``, ``field[key] = value``, ``del field[key]`` and
    ``key in field`` are supported.

    The stored information is:
        information from pdftk dump_data_fields (keys starting with "Field")
        informative name (["Name"])
        long description (["Description"])
        standard value (["Stdvalue"])  todo: how to implement that?
        something about a special treatment (["Special"])

    Constants:
        LISTINFOS: those infos can appear several times, hence store a list.

    """

    LISTINFOS = ["FieldStateOption"]

    def __init__(self):
        """Initialise a FormField without any information stored."""
        self.__pdfinfo = {}

    def convertToBool(self, possBool):
        """Convert value to True or False if this field is a Checkbox.

        Attributes:
            possBool: the value entered by the user, usually a str

        Returns:
            possBool if this is no Checkbox (the "FieldType" is missing or
            is not "Button"), if possBool is not a string or if it is
            unclear whether True or False is meant. Otherwise True or False.

        """
        try:
            if self["FieldType"] != "Button":
                return possBool
            lowered = possBool.lower()
        except (KeyError, AttributeError):
            # KeyError: "FieldType" does not exist
            # AttributeError: possBool has no lower(), i.e. is not a string
            return possBool
        if lowered in ("yes", "on", "x", "true"):
            return True
        if lowered in ("no", "_", "off", "false"):
            return False
        return possBool

    @staticmethod
    def convertUmlauts(value):
        """Convert all html-syntax umlauts to the correct symbols.

        Necessary to make fdfgen generate files that are read correctly
        by pdftk.

        Every decimal numeric entity (e.g. "&#228;") is decoded, not only
        the German umlauts. Entities whose number is not a valid code point
        are left untouched.

        Attributes:
            value (str): text that may contain "&#number;" entities

        Returns:
            value, &#number; replaced by the correct unicode character

        """
        # local import keeps the class independent of the file's import list
        import re

        def decode(match):
            try:
                return chr(int(match.group(1)))
            except (ValueError, OverflowError):
                # number outside of the unicode range: keep the text as is
                return match.group(0)

        return re.sub(r"&#([0-9]+);", decode, value)

    def __getitem__(self, key):
        """Get an information about this field.

        Returns:
            self[key]

        Raises:
            KeyError: as usual if key is not a valid key

        """
        value = self.__pdfinfo[key]
        # __setitem__ never stores None, hence this is an internal invariant
        assert value is not None
        return value

    def __setitem__(self, key, value):
        """Set an information about this field.

        Do: self[key] = value.

        If value is None, the key is removed. Removing a key that does not
        exist is silently ignored.

        """
        if value is None:
            # todo: add warning in some log file if key did not exist
            self.__pdfinfo.pop(key, None)
        else:
            self.__pdfinfo[key] = value

    def __delitem__(self, key):
        """Delete an information from this field.

        Raises:
            KeyError: as usual for dicts

        """
        del self.__pdfinfo[key]

    def __contains__(self, item):
        """Return if item is a valid info."""
        return item in self.__pdfinfo

    def expanditer(self):
        """Return an iterator that iterates over all information.

        For each information a tuple (name, value) is yielded.
        Information that stores a list (or another non-string iterable)
        is yielded once for each element.

        """
        for info, value in self.__pdfinfo.items():
            if isinstance(value, str):
                # strings are iterable as well but must stay in one piece
                yield (info, value)
                continue
            try:
                elements = iter(value)
            except TypeError:
                # e.g. bool is not iterable
                yield (info, value)
            else:
                for oneinfo in elements:
                    yield (info, oneinfo)

    def __iter__(self):
        """Return all info:value pairs.

        Lists are not expanded here (see expanditer for that).

        Returns:
            iterator over (info, value) tuples

        """
        return iter(self.__pdfinfo.items())

    def __len__(self):
        """Return number of information(s)."""
        return len(self.__pdfinfo)

    def essentiallyempty(self):
        """Return if no information from the user is stored.

        All information from dump_data_fields starts with "Field",
        everything else was entered by the user.

        Returns:
            True if every stored info starts with "Field" (this includes
            a field without any information), False otherwise

        """
        return all(info.startswith("Field") for info in self.__pdfinfo)

    def _showInViewer(self, origpdf):
        """Create a pdf where only this field is filled and open it in evince.

        Raises:
            ValueError: if this field has no "FieldName"

        """
        try:
            fdfinfo = [(self["FieldName"], "On")]
        except KeyError:
            raise ValueError("This field does not have a FieldName. This" +
                             " should not be possible in a pdf.")
        fdf = fdfgen.forge_fdf("", fdfinfo, [], [], [])
        with open("field-tmp.fdf", "wb") as fdf_file:
            fdf_file.write(fdf)
        tmppdf = os.path.basename(origpdf) + "-tmpfilled.pdf"
        # those pdfs are pseudo-encrypted. That makes pdftk issue
        # a warning that one should respect copyright. The warning is
        # discarded; a pipe that nobody reads could block pdftk.
        # Wait a maximum of 5 seconds for pdftk to finish,
        # then evince can work.
        cmd.run(["pdftk", origpdf, "fill_form", "field-tmp.fdf", "output",
                 tmppdf], stderr=cmd.DEVNULL, timeout=5)
        # do not need the errors of evince to show
        cmd.Popen(["evince", tmppdf], stderr=cmd.DEVNULL)

    def askUser(self, origpdf, update=False):
        """Ask the user for information about this form field.

        Create a pdf where only this field is filled with some data
        such that the user can easily identify which field we are
        talking about.
        Doing this, the files <basename of origpdf>-tmpfilled.pdf and
        field-tmp.fdf are created in the current directory.
        It is not checked if this is a problem.

        The user is only asked if update is True or if no user information
        is stored yet. An empty answer keeps the value stored before.

        Attributes:
            origpdf (str): path to the pdf file that all is about
            update (bool): if fields that have an information should be queried

        Returns:
            False if the user wants to abort (Ctrl+D) and considers
            her/himself done, True otherwise

        Raises:
            ValueError: if this field has no "FieldName"
            subprocess.TimeoutExpired: if pdftk needs more than 5 seconds

        """
        if not (update or self.essentiallyempty()):
            # data is there and should not be updated
            return True

        self._showInViewer(origpdf)
        print("The current field is called " + self["FieldName"])

        def nomessage():
            """Show no special message."""

        def buttonmessage():
            """Show a special message for buttons.

            Explains Button options.

            """
            if "FieldType" in self and self["FieldType"] == "Button":
                print("It is a butten.",
                      "'X', 'On', 'Yes' all say 'cross it'",
                      " while '_', 'Off', 'No' all say",
                      "'do not make a cross here'")

        def unchanged(answer):
            """Return the answer of the user without any conversion."""
            return answer

        questions = [
            ("Descriptive name", "Name", nomessage, unchanged),
            ("Long description", "Description", nomessage, unchanged),
            ("Standard value", "Stdvalue", buttonmessage,
             self.convertToBool),
            ("Special handling", "Special", nomessage, unchanged),
        ]
        for message, info, specialmessage, converting in questions:
            if info in self:
                print("Value now:", self[info])
                print("Enter nothing and this value is used.")
            specialmessage()
            try:
                newvalue = converting(input(message + ":"))
            except EOFError:
                # user hit Ctrl+D
                return False
            if isinstance(newvalue, str):
                if newvalue.strip() != "":
                    self[info] = newvalue
                # otherwise take old value
            elif newvalue is not None:
                # converted value, e.g. a bool
                self[info] = newvalue
        return True

    def __str__(self):
        """Return a several line long string with all info of this field.

        Each line has the form "info: value", lists are written as one
        line per element. Information that is missing is not displayed.

        """
        return "\n".join(str(info) + ": " + str(value)
                         for info, value in self.expanditer())

    def merge(self, otherField):
        """Merge two fields.

        Add the information from otherField that are not in self to self.

        Attributes:
            otherField (FormField): the field whose information is added

        Returns:
            self after adding stuff

        Raises:
            ValueError: if otherField and self disagree on a value.
                All values are compared before anything is stored,
                hence self is unchanged in this case.

        """
        for info, value in otherField:
            if info in self and self[info] != value:
                raise ValueError("The two fields disagree on " + info
                                 + ": '" + str(self[info]) + "' != '"
                                 + str(value) + "'")
        for info, value in otherField:
            # info does not exist in self or is equal
            self[info] = value
        return self
|
2018-02-07 16:27:15 +01:00
|
|
|
|
|
|
|
|
|
|
|
def parse(argv=None):
    """Parse the arguments.

    Attributes:
        argv (list of str): the arguments to parse. None (the default)
            parses the command line (sys.argv[1:]).

    Results:
        dictionary with
            input pdf file (["pdffile"])
            output info file (["output"]), "form.form" if not given
            if fields with stored information are asked again (["update"])

    """
    parser = argparse.ArgumentParser(
        description="Go through the list of fields of a pdf form" +
        " and ask for each one for a helpful description.")
    parser.add_argument("pdffile",
                        help="The pdf file with the form fields.")
    # nargs="?" makes the positional optional; without it argparse
    # always requires the argument and the default is never used
    parser.add_argument("output", nargs="?", default="form.form",
                        help="The file that includes all the information " +
                        "collected in this script.")
    parser.add_argument("-u", "--update", help="Also ask for information " +
                        "about fields that already have information stored.",
                        action="store_true")
    return vars(parser.parse_args(argv))
|
|
|
|
|
|
|
|
|
2018-02-15 01:02:05 +01:00
|
|
|
def listFields(pdf_file, fieldListFile):  # NOQA
    """Use pdftk dump_data_fields to generate a list of all data fields.

    The output of pdftk and the content of fieldListFile (if it exists)
    are parsed the same way. Fields with the same "FieldName" are merged
    into one field.

    Attribute:
        pdf_file (str): pdf file with the form
        fieldListFile (str): file with same format with some data already
            entered

    Results:
        list of all form fields. Each field is a FormField of information.
        Typically those are:
            FieldType
            FieldName
            FieldNameAlt
            FieldFlags
            FieldJustification
            FieldStateOption (list of several options)

    Raises
    ------
        ValueError: if a line cannot be parsed, if there is nothing to
            parse, if a field is empty, if an information that is no list
            appears twice in one field or if merged fields disagree
        KeyError: if a field has no "FieldName"
        some io error if pdftk has problems

    """
    dump = cmd.Popen(["pdftk", pdf_file, "dump_data_fields"], stdout=cmd.PIPE,
                     universal_newlines=True)
    # second result of communicate would be the error output
    field_desc = dump.communicate()[0].splitlines()
    # field_desc includes both inputs: pdftk and file
    try:
        with open(fieldListFile) as data:
            field_desc += data.read().splitlines()
    except FileNotFoundError:
        # no information stored yet
        pass

    if not field_desc:
        raise ValueError("pdftk dump_data_fields did not find any fields.")
    if field_desc[0] != "---":
        raise ValueError("Output of pdftk dump_data_fields should start " +
                         "with ---.")
    field_desc = [l for l in field_desc if l.strip() != ""]
    # terminate the last field as well
    field_desc.append("---")

    fields = []
    fieldsByName = {}  # same fields as in fields, for finding merge partners
    field = FormField()
    for line in field_desc[1:]:  # ignore first ---
        if line == "---":
            if len(field) == 0:
                raise ValueError("Several --- following each other.")
            try:
                name = field["FieldName"]
            except KeyError as ke:
                raise KeyError("Some field has no 'FieldName': "
                               + str(ke)) from ke
            if name in fieldsByName:
                fieldsByName[name].merge(field)
            else:
                # no field for merging found
                fieldsByName[name] = field
                fields.append(field)
            field = FormField()  # start new Field
            continue

        line = line.split(": ", maxsplit=1)
        try:
            line[1] = FormField.convertUmlauts(line[1])
        except IndexError:
            raise ValueError("The line '" + str(line) +
                             "' cannot be parsed. " +
                             "Apparently there is no ': '.") from None
        info, value = line
        if info in FormField.LISTINFOS:
            # several entries are collected in a list
            if info in field:
                field[info].append(value)
            else:
                field[info] = [value]
        elif info in field:
            raise ValueError("The information " + info +
                             " appeared twice in one field.")
        else:
            field[info] = value

    return fields
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Read the fields from the pdf and from the information stored so far.
    args = parse()
    pdfpath = args["pdffile"]
    infopath = args["output"]
    formfields = listFields(pdfpath, infopath)

    # Query the user field by field until all are done or the user aborts.
    for formfield in formfields:
        if not formfield.askUser(pdfpath, update=args["update"]):
            break

    # Store every field that carries user information, also after an abort.
    with open(infopath, mode="w") as outputfile:
        outputfile.writelines(
            "\n---\n" + str(formfield)
            for formfield in formfields
            if not formfield.essentiallyempty())
|
2018-02-07 16:27:15 +01:00
|
|
|
# next todos:
|
2018-02-08 11:08:38 +01:00
|
|
|
# • add possibility to abort this information input
|
2018-02-08 11:10:40 +01:00
|
|
|
# • evince should display the right page somehow, maybe give the user
|
|
|
|
# the possibility to say "this was on page 2, assume next one is on page 2
|
|
|
|
# as well"
|
2018-02-07 16:27:15 +01:00
|
|
|
# • save this information in a suitable file
|
|
|
|
# • read from this file and do not ask for things that are already
|
|
|
|
# saved in this file
|