pdfformfill/readformdata.py

#! /usr/bin/python3
# -*- coding: UTF-8 -*-
"""Functions for getting data from files.

If started as the main file, ask the user what the fields in
the pdf entered on the command line are.

"""

import subprocess as cmd
import argparse
from formfield import FormField
from constants import ConfigError, LISTINFOS, COMMENT_SYMBOL


def listFields(fields, newfile):  # NOQA
    """Use pdftk dump_data_fields output or a file to add to a data fields list.

    Blank lines are ignored. Lines starting with the comment symbol are
    ignored as well (it must be the first non-blank symbol in the line!).

    Attribute:
        fields [FormField]: list of FormFields to add to
        -> will be changed and returned
        newfile (str): pdf file or form-file with the form (data)
        If newfile is .pdf, use pdftk dump_data_fields_utf8
        If newfile is other, read as if it was output of pdftk dump_data_fields

    Results:
        The list passed in as ``fields``. A parsed field whose "FieldName"
        equals the one of a field already in the list is merged into that
        field, every other parsed field is appended.
        Typical pieces of information of a field are:
            FieldType
            FieldName
            FieldNameAlt
            FieldFlags
            FieldJustification
            FieldStateOption (list of several options)
        If the input contains no field descriptions at all, ``fields`` is
        returned unchanged.

    Raises:
        ConfigError: if the input does not start with ---, if a line
            contains no ': ' or if a non-list information appears twice
            in one field
        ValueError: if two --- lines follow each other directly
        KeyError: if a field involved in merging has no 'FieldName'
        subprocess.CalledProcessError: if pdftk exits with an error
        some io error if the file does not exist or pdftk cannot be started

    """
    if newfile.endswith(".pdf"):
        command = ["pdftk", newfile, "dump_data_fields_utf8"]
        dump = cmd.Popen(command,
                         stdout=cmd.PIPE,
                         universal_newlines=True)
        field_desc, _ = dump.communicate()
        # Without this check a failing pdftk (empty stdout) would be
        # indistinguishable from a pdf that simply has no form fields.
        if dump.returncode != 0:
            raise cmd.CalledProcessError(dump.returncode, command,
                                         output=field_desc)
    else:
        with open(newfile) as inputfile:
            field_desc = inputfile.read()
    # Applies to both inputs (pdftk and file):
    # ignore empty lines and comment lines.
    field_desc = [entry for entry in field_desc.splitlines()
                  if entry.strip() != "" and not
                  entry.strip().startswith(COMMENT_SYMBOL)]
    if not field_desc:
        # Nothing to parse (e.g. empty form file): nothing to add.
        return fields
    if field_desc[0] != "---":
        raise ConfigError("Output of pdftk dump_data_fields should start " +
                          "with ---. (Config files as well.)")
    # A closing --- makes the loop below store the last FormField as well.
    # Do not add a second one if the input already ends with a separator.
    if field_desc[-1] != "---":
        field_desc.append("---")
    field = FormField()
    for line in field_desc[1:]:  # ignore first ---
        if line == "---":
            if len(field) == 0:
                raise ValueError("Several --- following each other.")
            for f in fields:
                try:
                    if f["FieldName"] == field["FieldName"]:
                        f.merge(field)
                        break
                except KeyError as ke:
                    raise KeyError("Some field has no 'FieldName': "
                                   + str(ke)) from ke
            else:
                # no field for merging found
                fields.append(field)
            field = FormField()  # start new Field
            continue
        # Split only at the first ': ' so values may contain ': ' themselves.
        key, separator, value = line.partition(": ")
        if not separator:
            raise ConfigError("The line '" + line +
                              "' cannot be parsed. " +
                              "Apparently there is no ': '.")
        if key in field:  # several entries
            if key not in LISTINFOS:
                raise ConfigError("The information " + key +
                                  " appeared twice in one field.")
            field[key].append(value)
        elif key in LISTINFOS:
            # list-type information is always stored as a list
            field[key] = [value]
        else:
            field[key] = value

    return fields


def writeFields(fields, formfile, onlyNonEmpty=True):
    """Write the fields to formfile, each one preceded by a --- line.

    The file is opened for writing, so existing content is replaced.

    Attributes:
        fields: iterable of fields; each one is written as str(field)
        formfile (str): name of the file to write to
        onlyNonEmpty (Bool): True=skip every field whose essentiallyempty()
        is true, False=write all fields.

    """
    with open(formfile, mode="w") as outfile:
        for entry in fields:
            skip = entry.essentiallyempty() and onlyNonEmpty
            if skip:
                continue
            # separator first, then the field description itself
            outfile.write("\n---\n")
            outfile.write(str(entry))

def parse():
    """Parse the command line arguments.

    Results:
        dictionary with the entries
            "pdffile": input pdf file
            "output": output info file ("form.form" if not given)
            "update": True if -u/--update was given, otherwise False

    """
    parser = argparse.ArgumentParser(
        description="Go through the list of fields of a pdf form" +
        " and ask for each one for a helpful description.")
    parser.add_argument("pdffile",
                        help="The pdf file with the form fields.")
    # nargs="?" makes the positional optional; without it argparse always
    # requires the argument and the default would never be used.
    parser.add_argument("output", nargs="?", default="form.form",
                        help="The file that includes all the information " +
                        "collected in this script.\n" +
                        "If it exists already, add to the information saved.")
    parser.add_argument("-u", "--update", help="Also ask for information " +
                        "about fields that already have information stored.",
                        action="store_true")
    return vars(parser.parse_args())


def overviewFields(fields):
    """Build an overview text with one line "'FieldName' : Name" per field.

    Fields without a "Name" entry are left out. The lines are joined with
    newlines (no trailing newline), so the result can be written to a file
    to help someone keeping the overview while writing the config for a form.

    """
    rows = []
    for entry in fields:
        if "Name" not in entry:
            continue
        rows.append("'" + entry["FieldName"] + "' : " + entry["Name"])
    return "\n".join(rows)


if __name__ == "__main__":
    # Only needed when run as a script, hence the local import.
    import os.path

    # start
    args = parse()
    # Fields as described by the pdf itself.
    formfields = listFields([], args["pdffile"])
    try:
        # Add the information already stored in the output file, if any.
        formfields = listFields(formfields, args["output"])
    except FileNotFoundError:
        # No output file yet: nothing to be added.
        pass
    for fi in formfields:
        try:
            fi.askUser(args["pdffile"], update=args["update"])
        except (KeyboardInterrupt, EOFError):
            # Stop asking, but still save what has been collected so far.
            break
    writeFields(formfields, args["output"])
    # Put the "ov_" prefix in front of the file name, not in front of the
    # whole path, so that an output like "dir/form.form" gives
    # "dir/ov_form.form" instead of the non-existing "ov_dir/form.form".
    out_dir, out_name = os.path.split(args["output"])
    overview_path = os.path.join(out_dir, "ov_" + out_name)
    with open(overview_path, mode="w") as outputfile:
        outputfile.write(overviewFields(formfields))

    # next todos:
    # • evince should display the right page somehow, maybe give the user
    # the possibility to say "this was on page 2, assume next one is on page 2
    # as well"