read form fields from pdftk output
This commit is contained in:
parent
dc3ca010de
commit
af3bd3932f
1 changed files with 100 additions and 0 deletions
100
identify.py
Executable file
100
identify.py
Executable file
|
@ -0,0 +1,100 @@
|
|||
#! /usr/bin/python3
|
||||
"""Help decipher the field names in pdf files.
|
||||
|
||||
A script that goes through the list of pdf form fields and asks the
|
||||
user for suitable descriptions.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import subprocess as cmd
|
||||
# import fdfgen
|
||||
|
||||
|
||||
def parse():
    """Parse the command line arguments.

    Two positional arguments are understood:

        pdffile: the pdf file with the form fields (required).
        output:  the file that receives the collected information;
                 optional, falls back to "form.form" when omitted.

    Results:
        dict mapping the argument names ("pdffile", "output") to the
        values given on the command line.

    """
    parser = argparse.ArgumentParser(
        description="Go through the list of fields of a pdf form" +
        " and ask for each one for a helpful description.")
    parser.add_argument("pdffile",
                        help="The pdf file with the form fields.")
    # nargs="?" is required for the default to take effect: without it
    # argparse treats a positional argument as mandatory and silently
    # ignores the default.
    parser.add_argument("output", nargs="?", default="form.form",
                        help="The file that includes all the information " +
                        "collected in this script.")
    return vars(parser.parse_args())
|
||||
|
||||
|
||||
def _parse_field_dump(dump_text):
    """Turn the text printed by pdftk dump_data_fields into field dicts.

    Attribute:
        dump_text (str): complete output of pdftk dump_data_fields

    Results:
        list of dictionaries, one per "---" separated section. A key that
        occurs several times within one section maps to a list of all its
        values (in order of appearance); any other key maps to a string.
        An empty dump_text yields an empty list.

    Raises
    ------
        ValueError: if the text does not start with "---", ends with
            "---", or contains a line without ": "

    """
    lines = dump_text.splitlines()
    if not lines:
        # No output at all: report "no fields" instead of failing with an
        # IndexError on the first-line check below.
        return []
    if lines[0] != "---":
        raise ValueError("Output of pdftk dump_data_fields should start " +
                         "with ---.")

    fields = []
    field = {}
    for line in lines[1:]:  # the leading --- was checked above
        if line == "---":
            # separator: the current field is complete
            fields.append(field)
            field = {}
            continue

        # Split at the first ": " only, values may contain ": " themselves.
        key, separator, value = line.partition(": ")
        if not separator:
            raise ValueError("The line '" + line +
                             "' cannot be parsed. " +
                             "Apparently there is no ': '.")

        if key not in field:  # information is new
            field[key] = value
        elif isinstance(field[key], list):  # third or later entry
            field[key].append(value)
        else:  # second entry: promote the single value to a list
            field[key] = [field[key], value]

    if not field:
        # pdftk output ended with ---
        raise ValueError("Output of pdftk dump_data_fields should end " +
                         "with a field.")
    fields.append(field)
    return fields


def listFields(pdf_file):
    """Use pdftk dump_data_fields to generate a list of all data fields.

    Attribute:
        pdf_file (str): pdf file with the form

    Results:
        list of all form fields. Each field is a dictionary of information.
        Typically those are:
            FieldType
            FieldName
            FieldNameAlt
            FieldFlags
            FieldJustification
            FieldStateOption (list of several options)
        Keys that occur more than once for a field hold a list of values.
        If pdftk prints nothing, the list is empty.

    Raises
    ------
        ValueError: if a line cannot be parsed
        OSError: if the pdftk executable cannot be started
        subprocess.CalledProcessError: if pdftk exits with an error status

    """
    # check_output raises on a non-zero exit status, so a failing pdftk is
    # reported as such instead of surfacing later as a parsing problem.
    dump_text = cmd.check_output(["pdftk", pdf_file, "dump_data_fields"],
                                 universal_newlines=True)
    return _parse_field_dump(dump_text)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: read the command line, then collect the fields
    # of the given pdf form.
    args = parse()
    pdf_path = args["pdffile"]
    formfields = listFields(pdf_path)
    # Remaining steps (not implemented yet):
    #   • ask the user about a single field by creating a pdf that has
    #     something written in this field
    #   • ask the user for all fields
    #   • save that information in a suitable file
    #   • read from this file and do not ask for things that are already
    #     saved in it
|
Loading…
Reference in a new issue