read form fields from pdftk output

2018-02-07 16:27:15 +01:00 · 2018-02-07 16:27:15 +01:00 · af3bd3932f
commit af3bd3932f
parent dc3ca010de
1 changed files with 100 additions and 0 deletions
--- a/identify.py
+++ b/identify.py
@ -0,0 +1,100 @@
+#! /usr/bin/python3
+"""Help decipher the cryptic field names in pdf forms.
+
+A script that goes through the list of pdf form fields and asks the
+user for suitable descriptions.
+"""
+
+import argparse
+import subprocess as cmd
+# import fdfgen
+
+
+def parse(argv=None):
+    """
+    Parse the command line arguments.
+
+    Attribute:
+        argv (list of str): arguments to parse instead of the real command
+            line. None (the default) lets argparse read sys.argv.
+
+    Results:
+        dictionary with the keys
+            pdffile: the pdf file with the form fields
+            output: the file for the collected information,
+                "form.form" if not given on the command line
+
+    Raises
+    ------
+        SystemExit: raised by argparse if the arguments are invalid
+
+    """
+    parser = argparse.ArgumentParser(
+        description="Go through the list of fields of a pdf form" +
+        " and ask for each one for a helpful description.")
+    parser.add_argument("pdffile",
+                        help="The pdf file with the form fields.")
+    # nargs="?" makes the positional optional. Without it argparse ignores
+    # the default and always insists on a second argument.
+    parser.add_argument("output", nargs="?", default="form.form",
+                        help="The file that includes all the information " +
+                        "collected in this script.")
+    return vars(parser.parse_args(argv))
+
+
+def listFields(pdf_file):
+    """Use pdftk dump_data_fields to generate a list of all data fields.
+
+    Attribute:
+        pdf_file (str): pdf file with the form
+
+    Results:
+        list of all form fields. Each field is a dictionary of information.
+        A key that occurs once in a field maps to a string, a key that
+        occurs several times maps to the list of its values in order.
+        Typically the keys are:
+            FieldType
+            FieldName
+            FieldNameAlt
+            FieldFlags
+            FieldJustification
+            FieldStateOption (list of several options)
+        The list is empty if pdftk prints nothing at all.
+
+    Raises
+    ------
+        ValueError: if the output does not start with ---, if it ends
+            with ---, or if a line cannot be parsed
+        subprocess.CalledProcessError: if pdftk exits with an error status
+        OSError: if pdftk cannot be started
+
+    """
+    # check=True: a failing pdftk must not be mistaken for "no fields".
+    dump = cmd.run(["pdftk", pdf_file, "dump_data_fields"], stdout=cmd.PIPE,
+                   universal_newlines=True, check=True)
+    field_desc = dump.stdout.splitlines()
+    if not field_desc:
+        # nothing printed: no fields to report
+        return []
+    if field_desc[0] != "---":
+        raise ValueError("Output of pdftk dump_data_fields should start " +
+                         "with ---.")
+    fields = []
+    field = {}
+    for line in field_desc[1:]:  # ignore first ---
+        if line == "---":
+            # separator: the current field is complete
+            fields.append(field)
+            field = {}
+            continue
+        # split at the first ': ' only, the value may contain more of them
+        key, separator, value = line.partition(": ")
+        if not separator:
+            raise ValueError("The line '" + line +
+                             "' cannot be parsed. " +
+                             "Apparently there is no ': '.")
+        if key not in field:  # information is new
+            field[key] = value
+        elif isinstance(field[key], list):  # third or later entry
+            field[key].append(value)
+        else:  # second entry: turn the single value into a list
+            field[key] = [field[key], value]
+    if not field:
+        # pdftk output ended with ---
+        raise ValueError("Output of pdftk dump_data_fields should end " +
+                         "with a field.")
+    fields.append(field)
+    return fields
+
+
+if __name__ == "__main__":
+    # Script entry point: read the command line, then list the form fields
+    # of the given pdf file.
+    args = parse()
+    pdf_path = args["pdffile"]
+    formfields = listFields(pdf_path)
+    # TODO:
+    # • ask the user about a single field by creating a pdf that has
+    #   something written in this field
+    # • ask the user about all fields
+    # • save that information in a suitable file
+    # • read from this file and do not ask again for things that are
+    #   already saved in it