pdfformfill/readformdata.py

#! /usr/bin/python3
# -*- coding: UTF-8 -*-
"""Functions for getting data from files.

Constants:
    COMMENT_SYMBOL = "#":
        symbol to indicate a comment line
    SUBINFO_SEP = "-":
        symbol to separate subinformation, e.g. for commands

"""

import subprocess as cmd
import re
import argparse
import commands
from formfield import FormField


# A line whose first non-blank symbol is this one is treated as a comment.
COMMENT_SYMBOL = "#"
# Separates an info name from its subinformation, e.g. for commands.
SUBINFO_SEP = "-"
# NOTE(review): the VARIABLE_* separators are not used in the code visible
# in this module; presumably they delimit variables (and their parts) in
# config values -- confirm against the modules that import them.
VARIABLE_BEGIN_SEP = "{"
VARIABLE_END_SEP = "}"
VARIABLE_PARTS_SEP = "|"


class ConfigError(Exception):
    """Signal a mistake in a config file.

    Indicates that some required information is missing or that a line
    is malformed.

    Deliberately adds nothing of its own: the constructor inherited from
    Exception already does everything that is needed.

    """


class MobileRegex():
    """A little util class to check whether an info is saved as a list.

    To be used as a singleton, via the ``in`` operator.

    The names of the commands are looked up on every test through
    commands.Command.__subclasses__() instead of being compiled into one
    regex (or a list of regexes) up front, because doing that at import
    time would be a circular import.

    """

    def __contains__(self, info):
        """Return whether this info can appear several times.

        Attributes:
            info (str): the info queried for.

        Results:
            True if info is "FieldStateOption", the class name of a direct
            subclass of commands.Command, or an If clause of a command
            (some name, then SUBINFO_SEP, then "If..."); False otherwise.

        """
        if info == "FieldStateOption":
            return True
        command_names = [scls.__name__ for scls in
                         commands.Command.__subclasses__()]
        if info in command_names:
            return True
        # If clauses for commands: "<name><SUBINFO_SEP>If...".
        # The name may have any length >= 1 (hence the "+"), and the
        # separator is escaped so that it is always matched literally.
        sep = re.escape(SUBINFO_SEP)
        if_clause = r"[^" + sep + r"]+" + sep + r"If.*"
        return re.match(if_clause, info) is not None


# Shared instance; use ``info in LISTINFOS`` to test whether an info is
# stored as a list (i.e. may appear several times in one field).
LISTINFOS = MobileRegex()


def _readFieldLines(newfile):
    """Return the non-blank, non-comment lines describing the fields.

    For a .pdf the lines come from pdftk dump_data_fields, otherwise they
    are read from the file itself.
    """
    if newfile.endswith(".pdf"):
        dump = cmd.Popen(["pdftk", newfile, "dump_data_fields"],
                         stdout=cmd.PIPE,
                         universal_newlines=True)
        # second result would be stderr, which is not captured here
        field_desc, _ = dump.communicate()
        if dump.returncode != 0:
            # do not mistake a failed pdftk run for "pdf without fields"
            raise cmd.CalledProcessError(dump.returncode, dump.args)
    else:
        with open(newfile) as inputfile:
            field_desc = inputfile.read()
    # ignore empty lines and comment lines
    return [line for line in field_desc.splitlines()
            if line.strip() != "" and
            not line.strip().startswith(COMMENT_SYMBOL)]


def _addInfo(field, line):
    """Store the information of one 'name: value' line in field."""
    name, sep, value = line.partition(": ")
    if not sep:
        raise ValueError("The line '" + line +
                         "' cannot be parsed. " +
                         "Apparently there is no ': '.")
    value = FormField.convertUmlauts(value)
    is_list = name in LISTINFOS
    if name in field:  # several entries
        if not is_list:
            raise ValueError("The information " + name +
                             " appeared twice in one field.")
        field[name].append(value)
    elif is_list:
        field[name] = [value]
    else:
        field[name] = value


def _storeField(fields, field):
    """Merge field into the entry with the same FieldName, else append it."""
    for known in fields:
        try:
            same_name = known["FieldName"] == field["FieldName"]
        except KeyError as ke:
            raise KeyError("Some field has no 'FieldName': "
                           + str(ke)) from ke
        if same_name:
            known.merge(field)
            return
    # no field for merging found
    fields.append(field)


def listFields(fields, newfile):  # NOQA
    """Use pdftk dump_data_fields output or a file to add to a data fields list.

    Lines starting with # will be ignored (comments).
    (# must be the first non-blank symbol in the line!)
    Blank lines are ignored as well.

    Attribute:
        fields [FormField]: list of FormFields to add to
        -> will be changed and returned
        newfile (str): pdf file or form-file with the form (data)
        If newfile is .pdf, use pdftk dump_data_fields
        If newfile is other, read as if it was output of pdftk dump_data_fields

    Results:
        list of all form fields (the same list object as fields).
        A new field is merged into an already listed field with the same
        FieldName, otherwise it is appended.
        If the input has no relevant lines at all, fields is returned
        unchanged.
        Each field is a dictionary of information.
        Typically those are:
            FieldType
            FieldName
            FieldNameAlt
            FieldFlags
            FieldJustification
            FieldStateOption (list of several options)

    Raises
    ------
        ValueError: if a line cannot be parsed, i.e. the input does not
        start with ---, a line has no ': ', a non-list information appears
        twice in one field or several --- follow each other
        KeyError: if a field without 'FieldName' is met while merging
        subprocess.CalledProcessError: if pdftk exits with an error
        some io error if file does not exist or pdftk cannot be started

    """
    field_desc = _readFieldLines(newfile)
    if not field_desc:
        # e.g. an empty form-file or a pdf without any form fields
        return fields
    if field_desc[0] != "---":
        raise ValueError("Output of pdftk dump_data_fields should start " +
                         "with ---. (Config files as well.)")
    field = FormField()
    for line in field_desc[1:]:  # ignore first ---
        if line != "---":
            _addInfo(field, line)
            continue
        if len(field) == 0:
            raise ValueError("Several --- following each other.")
        _storeField(fields, field)
        field = FormField()  # start new Field
    if len(field) > 0:
        # the last field is not followed by a closing ---
        _storeField(fields, field)
    return fields


def writeFields(fields, formfile, onlyNonEmpty=True):
    """Write the data saved in fields to formfile.

    formfile is opened for writing, so existing content is replaced.
    Every written field is preceded by a line consisting of ---.

    Attributes:
        fields: the fields to write; each one is written as str(field)
        formfile (str): name of the file to write to
        onlyNonEmpty (Bool): True=skip every field whose essentiallyempty()
        is true, i.e. fields that only have the information that comes
        directly from the pdf (all infos begin with "Field").

    """
    with open(formfile, mode="w") as outfile:
        for entry in fields:
            skip = entry.essentiallyempty() and onlyNonEmpty
            if skip:
                continue
            outfile.write("\n---\n")
            outfile.write(str(entry))


def parse():
    """Parse the command line arguments.

    Results:
        dictionary with
        input pdf file (["pdffile"])
        output info file (["output"]), "form.form" if not given
        whether to ask again about already described fields (["update"])

    """
    parser = argparse.ArgumentParser(
        description="Go through the list of fields of a pdf form" +
        " and ask for each one for a helpful description.")
    parser.add_argument("pdffile",
                        help="The pdf file with the form fields.")
    # nargs="?" makes this positional optional; without it argparse
    # requires the argument and never applies the default.
    parser.add_argument("output", nargs="?", default="form.form",
                        help="The file that includes all the information " +
                        "collected in this script.")
    parser.add_argument("-u", "--update", help="Also ask for information " +
                        "about fields that already have information stored.",
                        action="store_true")
    return vars(parser.parse_args())


def overviewFields(fields):
    """Build an overview with one line 'FieldName' : Name per field.

    Fields without a "Name" are left out.

    The result can be written to a file to help someone keep the overview
    while writing the config for a form.

    """
    entries = []
    for formfield in fields:
        if "Name" not in formfield:
            continue
        entries.append("'" + formfield["FieldName"] + "' : " +
                       formfield["Name"])
    return "\n".join(entries)


if __name__ == "__main__":
    import os

    # start
    args = parse()
    formfields = listFields([], args["pdffile"])
    # Merge in the information collected in an earlier run. On the first
    # run the output file does not exist yet, so there is nothing to merge.
    if os.path.exists(args["output"]):
        formfields = listFields(formfields, args["output"])
    for fi in formfields:
        # askUser returns something false when the user wants to stop
        keep_going = fi.askUser(args["pdffile"], update=args["update"])
        if not keep_going:
            break
    writeFields(formfields, args["output"])
    # Put the overview next to the output file: the "ov_" prefix belongs to
    # the file name, not to a leading directory of the path.
    out_dir, out_name = os.path.split(args["output"])
    with open(os.path.join(out_dir, "ov_" + out_name),
              mode="w") as outputfile:
        outputfile.write(overviewFields(formfields))

    # next todos:
    # • evince should display the right page somehow, maybe give the user
    # the possibility to say "this was on page 2, assume next one is on page 2
    # as well"