pdfformfill/identify.py

#! /usr/bin/python3
"""Help deciphering the field names in pdf files.

A script that goes through the list of pdf form fields and asks the
user for suitable descriptions.
"""

import argparse
import subprocess as cmd
# import fdfgen


def parse():
    """
    Parse the command line arguments.

    Results:
        dict with the keys
            pdffile (str): the pdf file with the form fields
            output (str): the file the collected information is meant for;
                "form.form" if it is not given on the command line

    """
    parser = argparse.ArgumentParser(
        description="Go through the list of fields of a pdf form" +
        " and ask for each one for a helpful description.")
    parser.add_argument("pdffile",
                        help="The pdf file with the form fields.")
    # nargs="?" is what makes the default effective: without it argparse
    # treats a positional argument as required and never uses the default.
    parser.add_argument("output", nargs="?", default="form.form",
                        help="The file that includes all the information " +
                        "collected in this script.")
    return vars(parser.parse_args())


def listFields(pdf_file):
    """Use pdftk dump_data_fields to generate a list of all data fields.

    Attribute:
        pdf_file (str): pdf file with the form

    Results:
        list of all form fields. Each field is a dictionary of information.
        Typically those are:
            FieldType
            FieldName
            FieldNameAlt
            FieldFlags
            FieldJustification
            FieldStateOption (list of several options)
        A key that occurs more than once within one field is stored as a
        list of its values. If pdftk prints nothing, the list is empty.

    Raises
    ------
        ValueError: if the output does not start with ---, ends with ---,
            or contains a line without ': '
        subprocess.CalledProcessError: if pdftk exits with a non-zero status
        some io error (raised by Popen) if pdftk cannot be started

    """
    command = ["pdftk", pdf_file, "dump_data_fields"]
    dump = cmd.Popen(command, stdout=cmd.PIPE, universal_newlines=True)
    # Only stdout is piped, so the second result (stderr) is always None.
    field_desc, _ = dump.communicate()
    if dump.returncode != 0:
        # Do not try to parse the output of a failed pdftk run.
        raise cmd.CalledProcessError(dump.returncode, command,
                                     output=field_desc)
    return _parseFieldDump(field_desc)


def _parseFieldDump(field_desc):
    """Turn the text printed by pdftk dump_data_fields into a list of dicts.

    Attribute:
        field_desc (str): complete output of pdftk dump_data_fields

    Results:
        list of dictionaries, one per block that is introduced by a --- line

    Raises
    ------
        ValueError: if the text cannot be parsed

    """
    lines = field_desc.splitlines()
    if not lines:
        # nothing printed at all: there are no fields to report
        return []
    if lines[0] != "---":
        raise ValueError("Output of pdftk dump_data_fields should start " +
                         "with ---.")
    fields = []
    field = {}
    for line in lines[1:]:  # ignore first ---
        if line == "---":  # the next field starts here
            fields.append(field)
            field = {}
            continue
        # Split at the first ': ' only, the value may contain ': ' itself.
        key, separator, value = line.partition(": ")
        if not separator:
            raise ValueError("The line '" + line +
                             "' cannot be parsed. " +
                             "Apparently there is no ': '.")
        if key not in field:  # information is new
            field[key] = value
        elif isinstance(field[key], list):  # third or later entry
            field[key].append(value)
        else:  # second entry: turn the single value into a list
            field[key] = [field[key], value]
    if not field:
        # pdftk output ended with ---
        raise ValueError("Output of pdftk dump_data_fields should end " +
                         "with a field.")
    fields.append(field)
    return fields


if __name__ == "__main__":
    # Script entry point: read the command line, then collect the form
    # fields of the given pdf file.
    args = parse()
    pdf_path = args["pdffile"]
    formfields = listFields(pdf_path)
    # Still to do:
    # • ask the user about a single field by creating a pdf that has
    #   something written in this field
    # • ask the user about all fields
    # • save this information in a suitable file
    # • read from this file and do not ask for things that are already
    #   saved in it
read form fields from pdftk output 2018-02-07 16:27:15 +01:00			`#! /usr/bin/python3`
			`"""Help decrypting the field names in pdf files.`

			`A script that goes through the list of pdf form fields and asks the`
			`user for suitable descriptions.`
			`"""`

			`import argparse`
			`import subprocess as cmd`
			`# import fdfgen`


			`def parse():`
			`"""`
			`Parse the arguments.`

			`Results:`
			`...`

			`"""`
			`parser = argparse.ArgumentParser(`
			`description="Go through the list of fields of a pdf form" +`
			`" and ask for each one for a helpful description.")`
			`parser.add_argument("pdffile",`
			`help="The pdf file with the form fields.")`
			`parser.add_argument("output", default="form.form",`
			`help="The file that includes all the information " +`
			`"collected in this script.")`
			`return vars(parser.parse_args())`


			`def listFields(pdf_file):`
			`"""Use pdftk dump_data_fields to generate a list of all data fields.`

			`Attribute:`
			`pdf_file (str): pdf file with the form`

			`Results:`
			`list of all form fields. Each field is a dictionary of information.`
			`Typically those are:`
			`FieldType`
			`FieldName`
			`FieldNameAlt`
			`FieldFlags`
			`FieldJustification`
			`FieldStateOption (list of several options)`

			`Raises`
			`------`
			`ValueError: if a line cannot be parsed`
			`some io error if file does not exist or pdftk has problems`

			`"""`
			`dump = cmd.Popen(["pdftk", pdf_file, "dump_data_fields"], stdout=cmd.PIPE,`
			`universal_newlines=True)`
			`# stdout=sp.PIPE`
			`field_desc, _ = dump.communicate() # second result would be error`
			`fields = []`
			`field = {}`
			`field_desc = field_desc.splitlines()`
			`if field_desc[0] != "---":`
			`raise ValueError("Output of pdftk dump_data_fields should start " +`
			`"with ---.")`
			`for line in field_desc[1:]: # ignore first ---`
			`if line == "---":`
			`fields.append(field)`
			`field = {}`
			`else:`
			`line = line.split(": ", maxsplit=1)`
			`if len(line) != 2:`
			`raise ValueError("The line '" + line +`
			`"' cannot be parsed. " +`
			`"Apparently there is no ': '.")`
			`if line[0] in field: # several entries`
			`try: # works if it is already a list`
			`field[line[0]].append(line[1])`
			`except AttributeError:`
			`field[line[0]] = [field[line[0]], line[1]]`
			`else: # information is new`
			`field[line[0]] = line[1]`
			`if len(field) == 0:`
			`# pdftk output ended with ---`
			`raise ValueError("Output of pdftk dump_data_fields should end " +`
			`"with a field.")`
			`else:`
			`fields.append(field)`
			`return fields`


			`if __name__ == "__main__":`
			`# start`
			`args = parse()`
			`formfields = listFields(args["pdffile"])`
			`# next todos:`
			`# • ask user about a single field by creating a pdf that has something`
			`# written in this field`
			`# • ask user for all fields`
			`# • save those information in a suitable file`
			`# • read from this file and do not ask for things that are already`
			`# saved in this file`