pdfformfill/readformdata.py
2018-03-14 18:29:27 +01:00

224 lines
7.2 KiB
Python

#! /usr/bin/python3
# -*- coding: UTF-8 -*-
"""Functions for getting data from files.
Constants:
COMMENT_SYMBOL = "#":
symbol to indicate a comment line
SUBINFO_SEP = "-":
symbol to separate subinformation, e.g. for commands
"""
import subprocess as cmd
import re
import argparse
import commands
from formfield import FormField
# Symbol that marks a comment line (see the module docstring).
COMMENT_SYMBOL = "#"
# Symbol that separates subinformation, e.g. for commands.
SUBINFO_SEP = "-"
# NOTE(review): the three VARIABLE_* symbols are not used in the visible part
# of this file; presumably they delimit variable references and their parts
# in config values -- confirm against the modules that import them.
VARIABLE_BEGIN_SEP = "{"
VARIABLE_END_SEP = "}"
VARIABLE_PARTS_SEP = "|"
class ConfigError(Exception):
    """Signal a mistake in a config file.

    Config files are easy to get wrong.  This error is meant for the cases
    where some required information is missing or where a line is
    malformed.

    The class intentionally defines no ``__init__`` of its own: the one
    inherited from ``Exception`` already does everything that is needed.
    """
class MobileRegex():
    """Membership test for information names that may occur several times.

    To be used as a singleton through the module-level ``LISTINFOS``:
    ``name in LISTINFOS`` tells whether the values of ``name`` have to be
    saved as a list.

    The names of the ``commands.Command`` subclasses are looked up on every
    query instead of being collected once into a fixed list or regex.
    NOTE(review): the original author's note says a precomputed list would
    amount to a circular import -- confirm before caching anything here.
    """

    def __contains__(self, info):
        """Return whether this info can appear several times.

        Attributes:
            info (str): the info queried for.
        Results:
            bool: True if ``info`` is "FieldStateOption", if it is an If
            clause of a command (``<name><SUBINFO_SEP>If...``), or if it is
            the class name of a direct subclass of ``commands.Command``.
        """
        if info == "FieldStateOption":
            return True
        # If clauses for commands.  The name in front of the separator may
        # be of any length, hence the "+": without it re.match (which is
        # anchored at the start of the string) only accepted names that
        # consist of a single character.
        separator = re.escape(SUBINFO_SEP)
        if re.match(r"[^" + separator + r"]+" + separator + r"If.*", info):
            return True
        # Looked up last and on every call so that subclasses defined after
        # this module was imported are found as well.
        return info in [scls.__name__ for scls in
                        commands.Command.__subclasses__()]


LISTINFOS = MobileRegex()
def listFields(fields, newfile):  # NOQA
    """Use pdftk dump_data_fields output or a file to add to a data fields list.

    Lines starting with # will be ignored (comments).
    (# must be the first non-blank symbol in the line!)
    Blank lines are ignored as well.

    Attributes:
        fields [FormField]: list of FormFields to add to
            -> will be changed in place and returned
        newfile (str): pdf file or form-file with the form (data)
            If newfile ends with .pdf, use pdftk dump_data_fields.
            Otherwise read it as if it was output of pdftk dump_data_fields.
    Results:
        list of all form fields. Each field holds several pieces of
        information. Typically those are:
            FieldType
            FieldName
            FieldNameAlt
            FieldFlags
            FieldJustification
            FieldStateOption (list of several options)
        A field whose FieldName equals the one of a field already in
        ``fields`` is merged into that field instead of being appended.
    Raises:
        ValueError: if the description is empty or does not start with
            ---, if two --- follow each other, if a line contains no ': '
            or if an information that may only appear once appears twice
            in one field.
        KeyError: if a field has no 'FieldName'.
        OSError: if the file cannot be opened or pdftk cannot be started.
    """
    field_desc = _readFieldDescription(newfile)
    # An empty description (e.g. a pdf without form fields) is reported
    # like any other description that lacks the leading ---.
    if not field_desc or field_desc[0] != "---":
        raise ValueError("Output of pdftk dump_data_fields should start " +
                         "with ---. (Config files as well.)")
    field_desc.append("---")  # to include last FormField
    field = FormField()
    for line in field_desc[1:]:  # ignore first ---
        if line != "---":
            _addInfo(field, line)
            continue
        # A separator closes the current field, which must not be empty.
        if len(field) == 0:
            raise ValueError("Several --- following each other.")
        _storeField(fields, field)
        field = FormField()  # start new Field
    return fields


def _readFieldDescription(newfile):
    """Return the non-blank, non-comment description lines of newfile."""
    if newfile.endswith(".pdf"):
        dump = cmd.Popen(["pdftk", newfile, "dump_data_fields"],
                         stdout=cmd.PIPE,
                         universal_newlines=True)
        # The second result would be stderr, which is not captured here.
        field_desc, _ = dump.communicate()
    else:
        with open(newfile) as inputfile:
            field_desc = inputfile.read()
    # Same filtering for both inputs: pdftk and file.
    return [line for line in field_desc.splitlines()
            if line.strip() != "" and
            not line.strip().startswith(COMMENT_SYMBOL)]


def _storeField(fields, field):
    """Merge field into the entry of fields with equal FieldName or append it."""
    for known in fields:
        # Only the lookups are guarded, so that a KeyError raised inside
        # merge() is not mislabelled as a missing 'FieldName'.
        try:
            same_name = known["FieldName"] == field["FieldName"]
        except KeyError as ke:
            raise KeyError("Some field has no 'FieldName': "
                           + str(ke)) from ke
        if same_name:
            known.merge(field)
            return
    # no field for merging found
    fields.append(field)


def _addInfo(field, line):
    """Parse one 'Key: value' line and save the information in field."""
    key, separator, value = line.partition(": ")
    if not separator:
        # Show the raw line; the message used to show the split list.
        raise ValueError("The line '" + line +
                         "' cannot be parsed. " +
                         "Apparently there is no ': '.")
    value = FormField.convertUmlauts(value)
    if key in LISTINFOS:
        # may appear several times -> the values are collected in a list
        if key in field:
            field[key].append(value)
        else:
            field[key] = [value]
    elif key in field:
        raise ValueError("The information " + key +
                         " appeared twice in one field.")
    else:
        field[key] = value
def writeFields(fields, formfile, onlyNonEmpty=True):
    """Dump the given fields into the file formfile.

    The file is opened for writing, so previous content is replaced.
    Every field that is written is preceded by a line "---" and then
    written as ``str(field)``.

    Attributes:
        fields: the fields to write; each one has to offer
            ``essentiallyempty()`` and a string representation.
        formfile (str): name of the file to write to.
        onlyNonEmpty (Bool): True=skip the fields that report themselves
            as essentially empty, i.e. (according to the original
            documentation) fields that only have the information that
            comes directly from the pdf (all infos begin with "Field").
    """
    with open(formfile, mode="w") as outfile:
        for entry in fields:
            bare = entry.essentiallyempty()
            if bare and onlyNonEmpty:
                continue
            outfile.write("\n---\n")
            outfile.write(str(entry))
def parse():
    """Parse the command line arguments.

    Results:
        dictionary with the entries
        input pdf file (["pdffile"])
        output info file (["output"]), "form.form" if not given
        whether -u/--update was given (["update"], Bool)
    """
    parser = argparse.ArgumentParser(
        description="Go through the list of fields of a pdf form" +
        " and ask for each one for a helpful description.")
    parser.add_argument("pdffile",
                        help="The pdf file with the form fields.")
    # nargs="?" makes this positional optional.  Without it argparse
    # ignores the default and always demands the argument.
    parser.add_argument("output", nargs="?", default="form.form",
                        help="The file that includes all the information " +
                        "collected in this script.")
    parser.add_argument("-u", "--update", help="Also ask for information " +
                        "about fields that already have information stored.",
                        action="store_true")
    return vars(parser.parse_args())
def overviewFields(fields):
    """Build an overview with one line 'FieldName' : Name per field.

    Only fields that contain the information "Name" are listed; the lines
    are joined with newlines (no trailing newline).

    The result can be written to a file to help someone keeping the
    overview while writing the config for a form.
    """
    overview = []
    for entry in fields:
        if "Name" not in entry:
            continue
        overview.append("'" + entry["FieldName"] + "' : " + entry["Name"])
    return "\n".join(overview)
if __name__ == "__main__":
    import os.path  # only needed when this module is run as a script

    # start
    args = parse()
    # The fields as found in the pdf ...
    formfields = listFields([], args["pdffile"])
    try:
        # ... merged with the information stored by an earlier run.
        formfields = listFields(formfields, args["output"])
    except FileNotFoundError:
        # First run: nothing has been stored yet.  The output file is
        # created by writeFields below.
        pass
    for fi in formfields:
        # askUser returns something false when the user wants to stop.
        if not fi.askUser(args["pdffile"], update=args["update"]):
            break
    writeFields(formfields, args["output"])
    # The overview goes next to the output file.  Prefixing the whole path
    # would give a wrong location as soon as output contains a directory.
    out_dir, out_name = os.path.split(args["output"])
    with open(os.path.join(out_dir, "ov_" + out_name),
              mode="w") as outputfile:
        outputfile.write(overviewFields(formfields))
# next todos:
# • evince should display the right page somehow, maybe give the user
# the possibility to say "this was on page 2, assume next one is on page 2
# as well"