read form fields from pdftk output

This commit is contained in:
Bela 2018-02-07 16:27:15 +01:00
parent dc3ca010de
commit af3bd3932f

100
identify.py Executable file
View file

@ -0,0 +1,100 @@
#! /usr/bin/python3
"""Help decipher the field names in pdf files.

A script that goes through the list of pdf form fields and asks the
user for a suitable description of each one.
"""
import argparse
import subprocess as cmd
# import fdfgen
def parse():
    """Parse the command line arguments of the script.

    The arguments are read from the command line by argparse.

    Results:
        Dictionary of the parsed arguments with the keys
            "pdffile": the pdf file with the form fields
            "output": the file that includes all the information collected
                in this script; "form.form" if it is not given on the
                command line
    """
    parser = argparse.ArgumentParser(
        description="Go through the list of fields of a pdf form" +
        " and ask for each one for a helpful description.")
    parser.add_argument("pdffile",
                        help="The pdf file with the form fields.")
    # A positional argument stays mandatory unless nargs="?" is given;
    # without it the default below could never take effect.
    parser.add_argument("output", nargs="?", default="form.form",
                        help="The file that includes all the information " +
                        "collected in this script.")
    return vars(parser.parse_args())
def _parse_field_dump(dump_text):
    """Turn the text printed by pdftk dump_data_fields into field dicts."""
    lines = dump_text.splitlines()
    # An empty dump is rejected here too, instead of failing with an
    # IndexError when looking at the first line.
    if not lines or lines[0] != "---":
        raise ValueError("Output of pdftk dump_data_fields should start " +
                         "with ---.")
    fields = []
    field = {}
    for line in lines[1:]:  # the leading --- has been checked above
        if line == "---":
            # separator: the current field is complete, start the next one
            fields.append(field)
            field = {}
            continue
        # Only the first ': ' separates key and value; the value itself may
        # contain further ': '.
        key, separator, value = line.partition(": ")
        if not separator:
            # `line` is still the raw string here, so the message can be
            # built and the documented ValueError is raised.
            raise ValueError("The line '" + line +
                             "' cannot be parsed. " +
                             "Apparently there is no ': '.")
        if key not in field:  # information is new
            field[key] = value
        elif isinstance(field[key], list):  # third or later entry
            field[key].append(value)
        else:  # second entry: turn the single value into a list
            field[key] = [field[key], value]
    if not field:
        # pdftk output ended with ---
        raise ValueError("Output of pdftk dump_data_fields should end " +
                         "with a field.")
    fields.append(field)
    return fields


def listFields(pdf_file):
    """Use pdftk dump_data_fields to generate a list of all data fields.

    Attribute:
        pdf_file (str): pdf file with the form

    Results:
        list of all form fields. Each field is a dictionary of information.
        A key that occurs once maps to a string, a key that occurs several
        times within one field maps to the list of its values.
        Typically those are:
            FieldType
            FieldName
            FieldNameAlt
            FieldFlags
            FieldJustification
            FieldStateOption (list of several options)

    Raises:
        ValueError: if the output of pdftk is empty or does not start with
            ---, if a line cannot be parsed (it contains no ': '), or if
            the output ends with --- instead of a field
        OSError: if the pdftk executable cannot be started

    NOTE(review): the exit status of pdftk is not checked. Presumably a
    failing pdftk prints nothing on stdout, which then shows up as the
    ValueError about the missing leading --- -- confirm.
    """
    with cmd.Popen(["pdftk", pdf_file, "dump_data_fields"], stdout=cmd.PIPE,
                   universal_newlines=True) as dump:
        field_desc, _ = dump.communicate()  # second result would be error
    return _parse_field_dump(field_desc)
if __name__ == "__main__":
    # Script entry point: read the command line, then collect the form
    # fields of the given pdf file.
    args = parse()
    formfields = listFields(args["pdffile"])
    # Next todos:
    #   - ask the user about a single field by creating a pdf that has
    #     something written in this field
    #   - ask the user for all fields
    #   - save this information in a suitable file
    #   - read from this file and do not ask for things that are already
    #     saved in it