pdfformfill/readformdata.py

177 lines
6.2 KiB
Python
Executable file

#! /usr/bin/python3
# -*- coding: UTF-8 -*-
"""Functions for getting data from files.
If started as the main file, ask the user what the fields in
the pdf entered on the command line are.
"""
import subprocess as cmd
import argparse
from formfield import FormField
from constants import ConfigError, LISTINFOS, COMMENT_SYMBOL
def listFields(fields, newfile):  # NOQA
    """Add the form fields described by newfile to the list fields.

    The description is either the output of
    ``pdftk <newfile> dump_data_fields_utf8`` (if newfile ends with ".pdf")
    or the content of newfile itself, which is read as if it was such
    pdftk output: every field starts with a line "---", followed by lines
    of the form "Key: value".
    Lines that are blank or whose first non-blank text is COMMENT_SYMBOL
    are ignored (comments).

    A parsed field whose "FieldName" equals the one of a field that is
    already in fields is merged into that field (via its merge method),
    every other parsed field is appended to fields.
    If nothing is left after removing blank and comment lines, fields is
    returned unchanged.

    Attributes:
        fields [FormField]: list of FormFields to add to
            -> will be changed in place and returned
        newfile (str): pdf file or form-file with the form (data)

    Results:
        fields, i.e. the list of all form fields. Each field stores its
        information under the keys found in the description.
        Typically those are:
            FieldType
            FieldName
            FieldNameAlt
            FieldFlags
            FieldJustification
            FieldStateOption (list of several options)
        Keys listed in LISTINFOS are stored as lists of all their values.

    Raises:
        ConfigError: if the description does not start with "---", if a
            line contains no ": ", or if a key that is not in LISTINFOS
            appears twice in one field
        ValueError: if a "---" line closes a field without any information
            (e.g. several "---" lines directly following each other)
        KeyError: if a field has no "FieldName" when looking for a field
            to merge with
        Errors of open() / subprocess.Popen if the file cannot be opened
        or pdftk cannot be started.
    """
    if newfile.endswith(".pdf"):
        # NOTE(review): universal_newlines=True decodes pdftk's output with
        # the locale's preferred encoding while dump_data_fields_utf8
        # presumably emits UTF-8 -- confirm on systems with another locale.
        dump = cmd.Popen(["pdftk", newfile, "dump_data_fields_utf8"],
                         stdout=cmd.PIPE,
                         universal_newlines=True)
        # stderr is not piped, so the second result carries nothing
        field_desc, _ = dump.communicate()
    else:
        with open(newfile) as inputfile:
            field_desc = inputfile.read()
    # ignore empty lines and comment lines (for both kinds of input)
    field_desc = [line for line in field_desc.splitlines()
                  if line.strip() != "" and
                  not line.strip().startswith(COMMENT_SYMBOL)]
    if not field_desc:
        # No field described at all (e.g. an empty form-file):
        # there is nothing to add.
        return fields
    if field_desc[0] != "---":
        raise ConfigError("Output of pdftk dump_data_fields should start " +
                          "with ---. (Config files as well.)")
    field_desc.append("---")  # to include last FormField
    field = FormField()
    for line in field_desc[1:]:  # ignore first ---
        if line == "---":
            # "---" closes the current field; it must contain something
            if len(field) == 0:
                raise ValueError("Several --- following each other.")
            for known in fields:
                # only the name lookup is guarded, so a KeyError raised
                # inside merge() is not mislabelled as a missing FieldName
                try:
                    same_name = known["FieldName"] == field["FieldName"]
                except KeyError as ke:
                    raise KeyError("Some field has no 'FieldName': "
                                   + str(ke)) from ke
                if same_name:
                    known.merge(field)
                    break
            else:
                # no field for merging found
                fields.append(field)
            field = FormField()  # start new Field
            continue
        key, separator, value = line.partition(": ")
        if not separator:
            raise ConfigError("The line '" + line +
                              "' cannot be parsed. " +
                              "Apparently there is no ': '.")
        if key in field:  # several entries
            if key not in LISTINFOS:
                raise ConfigError("The information " + key +
                                  " appeared twice in one field.")
            field[key].append(value)
        elif key in LISTINFOS:
            # list-valued information always starts as a one-element list
            field[key] = [value]
        else:
            field[key] = value
    return fields
def writeFields(fields, formfile, onlyNonEmpty=True):
    """Write the string form of every field in fields to formfile.

    formfile is opened for writing (an existing file is overwritten).
    Each written field is preceded by an empty line and a "---" line.

    Attributes:
        fields: iterable of fields; each one must provide
            essentiallyempty() and a string representation
        formfile (str): path of the file to write
        onlyNonEmpty (Bool): True=skip the fields whose essentiallyempty()
            returns a true value, False=write all fields.
    """
    with open(formfile, "w") as target:
        for entry in fields:
            # essentiallyempty() is evaluated first, as before
            if entry.essentiallyempty() and onlyNonEmpty:
                continue
            target.write("\n---\n")
            target.write(str(entry))
def parse(argv=None):
    """Parse the command line arguments.

    Attributes:
        argv ([str] or None): the arguments to parse, without the program
            name. None (the default) lets argparse use sys.argv[1:].

    Results:
        dictionary with the entries
        input pdf file (["pdffile"])
        output info file (["output"]), "form.form" if it is not given
        whether -u/--update was given (["update"], bool)
    """
    parser = argparse.ArgumentParser(
        description="Go through the list of fields of a pdf form" +
        " and ask for each one for a helpful description.")
    parser.add_argument("pdffile",
                        help="The pdf file with the form fields.")
    # nargs="?" is what makes this positional optional: without it
    # argparse requires the argument and never uses the default.
    parser.add_argument("output", nargs="?", default="form.form",
                        help="The file that includes all the information " +
                        "collected in this script.\n" +
                        "If it exists already, add to the information saved.")
    parser.add_argument("-u", "--update", help="Also ask for information " +
                        "about fields that already have information stored.",
                        action="store_true")
    # An optional positional is matched (as "missing") together with the
    # positionals before it, so "in.pdf -u out.form" would be rejected by
    # plain parse_args on older Python versions. Intermixed parsing keeps
    # that order working; fall back to parse_args where it does not exist.
    parse_args = getattr(parser, "parse_intermixed_args", parser.parse_args)
    return vars(parse_args(argv))
def overviewFields(fields):
    """Build an overview text with one line 'FieldName' : Name per field.

    Only fields that contain the key "Name" are listed; the lines are
    joined with newlines (no trailing newline).
    The result can be written to a file to help someone keeping the
    overview while writing the config for a form.
    """
    overview_lines = []
    for entry in fields:
        if "Name" not in entry:
            # fields without a good name do not show up in the overview
            continue
        overview_lines.append(
            "'" + entry["FieldName"] + "' : " + entry["Name"])
    return "\n".join(overview_lines)
if __name__ == "__main__":
    # Script mode: read the fields of the pdf, merge the information that
    # is already stored in the output file, ask the user about the fields
    # and write the result plus an overview file.
    import os.path

    args = parse()
    formfields = listFields([], args["pdffile"])
    try:
        formfields = listFields(formfields, args["output"])
    except FileNotFoundError:
        # no output file yet -> nothing to be added
        pass
    for fi in formfields:
        try:
            fi.askUser(args["pdffile"], update=args["update"])
        except (KeyboardInterrupt, EOFError):
            # the user stopped answering: keep what was collected so far
            break
    writeFields(formfields, args["output"])
    # The overview goes next to the output file. Prefix only the file
    # name with "ov_": prefixing the whole path would change the directory
    # part ("dir/form.form" -> "ov_dir/form.form").
    output_dir, output_name = os.path.split(args["output"])
    overview_path = os.path.join(output_dir, "ov_" + output_name)
    with open(overview_path, mode="w") as outputfile:
        outputfile.write(overviewFields(formfields))
# next todos:
# • evince should display the right page somehow, maybe give the user
# the possibility to say "this was on page 2, assume next one is on page 2
# as well"