restructuring: FormField is now on its own, while reading/writing config data lives in readformdata.py

This commit is contained in:
Bela 2018-02-15 21:43:25 +01:00
parent 90eb325464
commit 0520c310e8
2 changed files with 186 additions and 150 deletions

View file

@ -1,21 +1,24 @@
#! /usr/bin/python3
# -*- coding: UTF-8 -*-
"""Help decrypting the field names in pdf files.
A script that goes through the list of pdf form fields and asks the
FormFields model all information existing about fields in pdf forms.
Include a method to go through the list of pdf form fields and ask the
user for suitable descriptions.
The information stored for the fields are:
name
long description
standard value
special (hardcoded) treatment?, maybe code that tells the code how to use it
information from the pdf file
Name
Description
commands (as in config files)
"""
# todo: the button variants are Off, On; this should be adjustable or be
# adjusted automatically
# todo: disable focus on pdf - apparently not simple :(
import argparse
import subprocess as cmd
import os.path
import fdfgen
@ -228,27 +231,28 @@ class FormField():
# the errors to show
print("The current field is called " + self["FieldName"])
def buttonmessage():
"""Show a special message for buttons.
Explains Button options.
"""
if self["FieldType"] == "Button":
print("It is a butten.",
"'X', 'On', 'Yes' all say 'cross it'",
" while '_', 'Off', 'No' all say",
"'do not make a cross here'")
# def buttonmessage():
# """Show a special message for buttons.
#
# Explains Button options.
#
# """
# if self["FieldType"] == "Button":
# print("It is a butten.",
# "'X', 'On', 'Yes' all say 'cross it'",
# " while '_', 'Off', 'No' all say",
# "'do not make a cross here'")
for message, info, specialmessage, converting in [
("Descriptive name", "Name", lambda: None,
lambda x: x),
("Long description", "Description", lambda: None,
lambda x: x),
("Standard value", "Stdvalue", buttonmessage,
self.convertToBool),
("Special handling", "Special", lambda: None,
lambda x: x)]:
# ("Standard value", "Stdvalue", buttonmessage,
# self.convertToBool),
# ("Special handling", "Special", lambda: None,
# lambda x: x)
]:
if info in self:
print("Value now:", self[info])
print("Enter nothing and this value is used.")
@ -305,132 +309,3 @@ class FormField():
for info, value in otherField:
# info does not exist in self or is equal
self[info] = value
def parse(argv=None):
    """Parse the command line arguments.

    Attribute:
        argv (list of str, optional): the arguments to parse. If None,
            argparse falls back to the arguments of the running program.

    Results:
        dictionary with the entries
            "pdffile": input pdf file
            "output": output info file (defaults to "form.form")
            "update": True if -u/--update was given, otherwise False
    """
    parser = argparse.ArgumentParser(
        description="Go through the list of fields of a pdf form" +
        " and ask for each one for a helpful description.")
    parser.add_argument("pdffile",
                        help="The pdf file with the form fields.")
    # nargs="?" makes the positional optional; without it argparse
    # requires the argument and the default is never used.
    parser.add_argument("output", nargs="?", default="form.form",
                        help="The file that includes all the information " +
                        "collected in this script.")
    parser.add_argument("-u", "--update", help="Also ask for information " +
                        "about fields that already have information stored.",
                        action="store_true")
    return vars(parser.parse_args(argv))
def listFields(pdf_file, fieldListFile):  # NOQA
    """Use pdftk dump_data_fields to generate a list of all data fields.

    The output of pdftk and the content of fieldListFile (if that file
    exists) are read as one sequence of fields separated by "---" lines.
    Fields with the same "FieldName" are merged into one FormField.

    Attribute:
        pdf_file (str): pdf file with the form
        fieldListFile (str): file with same format with some data already
            entered; a missing file is silently ignored

    Results:
        list of all form fields. Each field is a dictionary of information.
        Typically those are:
            FieldType
            FieldName
            FieldNameAlt
            FieldFlags
            FieldJustification
            FieldStateOption (list of several options)
        The list is empty if neither source contains any line.

    Raises
    ------
        ValueError: if a line cannot be parsed, if the input does not start
            with "---", if two "---" follow each other or if a non-list
            information appears twice in one field
        KeyError: if a field without 'FieldName' has to be compared
        some io error if pdftk has problems

    """
    dump = cmd.Popen(["pdftk", pdf_file, "dump_data_fields"], stdout=cmd.PIPE,
                     universal_newlines=True)
    # second result of communicate() would be the error output
    field_desc = dump.communicate()[0].splitlines()
    # includes both inputs: pdftk and file
    try:
        with open(fieldListFile) as data:
            field_desc += data.read().splitlines()
    except FileNotFoundError:
        # no information stored yet
        pass
    field_desc = [line for line in field_desc if line.strip() != ""]
    fields = []
    if not field_desc:
        # nothing to parse at all
        return fields
    if field_desc[0] != "---":
        raise ValueError("Output of pdftk dump_data_fields should start " +
                         "with ---.")
    if field_desc[-1] != "---":
        field_desc.append("---")  # to include the last field
    field = FormField()
    for line in field_desc[1:]:  # ignore first ---
        if line == "---":
            if len(field) == 0:
                raise ValueError("Several --- following each other.")
            # else add:
            for f in fields:
                try:
                    if f["FieldName"] == field["FieldName"]:
                        f.merge(field)
                        break
                except KeyError as ke:
                    raise KeyError("Some field has no 'FieldName': "
                                   + str(ke)) from ke
            else:
                # no field for merging found
                fields.append(field)
            field = FormField()  # start new Field
        else:
            parts = line.split(": ", maxsplit=1)
            if len(parts) != 2:
                raise ValueError("The line '" + line +
                                 "' cannot be parsed. " +
                                 "Apparently there is no ': '.")
            info, value = parts[0], FormField.convertUmlauts(parts[1])
            if info in field:  # several entries
                if info in FormField.LISTINFOS:
                    field[info].append(value)
                else:
                    raise ValueError("The information " + info +
                                     " appeared twice in one field.")
            elif info in FormField.LISTINFOS:
                field[info] = [value]
            else:
                field[info] = value
    return fields
if __name__ == "__main__":
    # Collect the options, ask the user about every field (until the
    # user aborts) and store everything worth keeping.
    options = parse()
    pdf_path = options["pdffile"]
    info_path = options["output"]
    all_fields = listFields(pdf_path, info_path)
    for entry in all_fields:
        if not entry.askUser(pdf_path, update=options["update"]):
            # askUser signalled that the user does not want to continue
            break
    with open(info_path, mode="w") as target:
        for entry in all_fields:
            if entry.essentiallyempty():
                continue
            target.write("\n---\n")
            target.write(str(entry))
# next todos:
# • add possibility to abort this information input
# • evince should display the right page somehow, maybe give the user
# the possibility to say "this was on page 2, assume next one is on page 2
# as well"
# • save those information in a suitable file
# • read from this file and do not ask for things that are already
# saved in this file

161
readformdata.py Normal file
View file

@ -0,0 +1,161 @@
#! /usr/bin/python3
# -*- coding: UTF-8 -*-
"""Functions for getting data from files.
Constants:
COMMENT_SYMBOL = "#"
TODO:
part with FormField.LISTINFOS feels like hardcoded hack.
"""
import subprocess as cmd
import argparse
from formfield import FormField
COMMENT_SYMBOL = "#"
def listFields(fields, newfile):  # NOQA
    """Use pdftk dump_data_fields output to add to a list of data fields.

    Lines starting with # will be ignored (comments).
    (# must be the first non-blank symbol in the line!)
    Blank lines are ignored as well. Fields are separated by "---" lines;
    a field whose "FieldName" already exists in fields is merged into the
    existing one, every other field is appended.

    Attribute:
        fields [FormField]: list of FormFields to add to
                            -> will be changed and returned
        newfile (str): pdf file or form-file with the form (data)
            If newfile is .pdf, use pdftk dump_data_fields
            If newfile is other, read as if it was output of pdftk
            dump_data_fields

    Results:
        list of all form fields. Each field is a dictionary of information.
        Typically those are:
            FieldType
            FieldName
            FieldNameAlt
            FieldFlags
            FieldJustification
            FieldStateOption (list of several options)
        If newfile contains no usable line, fields is returned unchanged.

    Raises
    ------
        ValueError: if a line cannot be parsed, if the input does not start
            with "---", if two "---" follow each other or if a non-list
            information appears twice in one field
        KeyError: if a field without 'FieldName' has to be compared
        some io error if file does not exist or pdftk has problems

    """
    if newfile.endswith(".pdf"):
        dump = cmd.Popen(["pdftk", newfile, "dump_data_fields"],
                         stdout=cmd.PIPE,
                         universal_newlines=True)
        # second result of communicate() would be the error output
        field_desc, _ = dump.communicate()
    else:
        with open(newfile) as inputfile:
            field_desc = inputfile.read()
    # ignore empty lines and comment lines
    field_desc = [line for line in field_desc.splitlines()
                  if line.strip() != "" and
                  not line.strip().startswith(COMMENT_SYMBOL)]
    if not field_desc:
        # empty config file or no pdftk output: nothing to add
        return fields
    if field_desc[0] != "---":
        raise ValueError("Output of pdftk dump_data_fields should start " +
                         "with ---. (Config files as well.)")
    if field_desc[-1] != "---":
        field_desc.append("---")  # to include last FormField
    field = FormField()
    for line in field_desc[1:]:  # ignore first ---
        if line == "---":
            if len(field) == 0:
                raise ValueError("Several --- following each other.")
            # else add:
            for f in fields:
                try:
                    if f["FieldName"] == field["FieldName"]:
                        f.merge(field)
                        break
                except KeyError as ke:
                    raise KeyError("Some field has no 'FieldName': "
                                   + str(ke)) from ke
            else:
                # no field for merging found
                fields.append(field)
            field = FormField()  # start new Field
        else:
            parts = line.split(": ", maxsplit=1)
            if len(parts) != 2:
                raise ValueError("The line '" + line +
                                 "' cannot be parsed. " +
                                 "Apparently there is no ': '.")
            info, value = parts[0], FormField.convertUmlauts(parts[1])
            if info in field:  # several entries
                if info in FormField.LISTINFOS:
                    field[info].append(value)
                else:
                    raise ValueError("The information " + info +
                                     " appeared twice in one field.")
            elif info in FormField.LISTINFOS:
                field[info] = [value]
            else:
                field[info] = value
    return fields
def parse(argv=None):
    """Parse the command line arguments.

    Attribute:
        argv (list of str, optional): the arguments to parse. If None,
            argparse falls back to the arguments of the running program.

    Results:
        dictionary with the entries
            "pdffile": input pdf file
            "output": output info file (defaults to "form.form")
            "update": True if -u/--update was given, otherwise False
    """
    parser = argparse.ArgumentParser(
        description="Go through the list of fields of a pdf form" +
        " and ask for each one for a helpful description.")
    parser.add_argument("pdffile",
                        help="The pdf file with the form fields.")
    # nargs="?" makes the positional optional; without it argparse
    # requires the argument and the default is never used.
    parser.add_argument("output", nargs="?", default="form.form",
                        help="The file that includes all the information " +
                        "collected in this script.")
    parser.add_argument("-u", "--update", help="Also ask for information " +
                        "about fields that already have information stored.",
                        action="store_true")
    return vars(parser.parse_args(argv))
def overviewFields(fields):
    """Build an overview text with one line "'FieldName' : Name" per field.

    Fields without a "Name" entry are left out. The lines are joined by
    newlines (no trailing newline), so the result can be written to a
    file that helps to keep the overview while writing the config for a
    form.
    """
    entries = []
    for formfield in fields:
        if "Name" not in formfield:
            continue
        entries.append("'" + formfield["FieldName"] + "' : "
                       + formfield["Name"])
    return "\n".join(entries)
if __name__ == "__main__":
    # local import: only this script part needs path handling
    import os.path

    # start
    args = parse()
    formfields = listFields([], args["pdffile"])
    try:
        formfields = listFields(formfields, args["output"])
    except FileNotFoundError:
        # first run: no information has been stored yet
        pass
    for fi in formfields:
        continu = fi.askUser(args["pdffile"], update=args["update"])
        if not continu:
            break
    with open(args["output"], mode="w") as outputfile:
        for fi in formfields:
            if not fi.essentiallyempty():
                outputfile.write("\n---\n")
                outputfile.write(str(fi))
    # put the overview next to the output file; prefixing the whole path
    # would break as soon as the output contains a directory
    outputdir, outputname = os.path.split(args["output"])
    overviewpath = os.path.join(outputdir, "ov_" + outputname)
    with open(overviewpath, mode="w") as overviewfile:
        overviewfile.write(overviewFields(formfields))
# next todos:
# • evince should display the right page somehow, maybe give the user
# the possibility to say "this was on page 2, assume next one is on page 2
# as well"