fix umlaut handling

2018-02-15 20:41:35 +01:00 · 2018-02-15 20:41:35 +01:00 · 9b2bc16c8a
commit 9b2bc16c8a
parent 836c02060f
1 changed files with 41 additions and 7 deletions
--- a/identify.py
+++ b/identify.py
@ -13,6 +13,7 @@ The information stored for the fields are:

 # todo: Button Varianten sind Off, On, sollte anpassbar/ automatisch
 # angepasst werden
+# todo: disable focus on pdf  - apparently not simple :(

 import argparse
 import subprocess as cmd
@ -34,8 +35,13 @@ class FormField():
        standard value (["Stdvalue"]) todo: how to implement that?
        something about a special treatment (["Special"])

+    Constants:
+        LISTINFOS: those infos can appear several times, hence store a list.
+
    """

+    LISTINFOS = ["FieldStateOption"]
+
    def convertToBool(self, possBool):
        """Convert value to True or or False if this field is a Checkbox.

@ -62,12 +68,34 @@ class FormField():
            # maybe something more intelligent necessary
            return possBool

+    @staticmethod
+    def convertUmlauts(value):
+        """Convert decimal html character references to their symbols.
+
+        Necessary to make fdfgen generate files that are read correctly
+        by pdftk. Every "&#number;" is decoded, not only German umlauts;
+        numbers above the unicode range (0x10FFFF) are left untouched.
+
+        Args:
+            value: str that may contain "&#number;" references
+        Returns:
+            value, each &#number; replaced by the correct unicode
+
+        """
+        import re  # function-local: the file's import block is not changed
+
+        def decode(match):
+            number = int(match.group(1))
+            return chr(number) if number <= 0x10FFFF else match.group(0)
+        return re.sub("&#([0-9]+);", decode, value)
+
    def __init__(self):
        """
        Initialise a FormField.

        With an empty dictionary for pdfinfo
        and None for the other values.
+
        """
        self.__pdfinfo = {}

@ -204,6 +232,7 @@ class FormField():
                    """Show a special message for buttons.

                    Explains Button options.
+
                    """
                    if self["FieldType"] == "Button":
                        print("It is a butten.",
@ -363,18 +392,23 @@ def listFields(pdf_file, fieldListFile):  # NOQA
            field = FormField()  # start new Field
        else:
            line = line.split(": ", maxsplit=1)
-            if len(line) != 2:
+            try:
+                line[1] = FormField.convertUmlauts(line[1])
+            except IndexError:
                raise ValueError("The line '" + str(line) +
                                 "' cannot be parsed. " +
                                 "Apparently there is no ': '.")
            if line[0] in field:  # several entries
-                try:  # works if it is already a list
+                if line[0] in FormField.LISTINFOS:
                    field[line[0]].append(line[1])
-                except AttributeError:  # append does not exist for str :)
-                    field[line[0]] = [field[line[0]], line[1]]
-                    # include existing info
-            else:  # information is new
-                field[line[0]] = line[1]
+                else:
+                    raise ValueError("The information " + line[0] +
+                                     "appeared twice in one field.")
+            else:
+                if line[0] in FormField.LISTINFOS:
+                    field[line[0]] = [line[1]]
+                else:
+                    field[line[0]] = line[1]

    return fields