website/removeBadSymbols.py
2024-07-21 11:53:13 +02:00

329 lines
11 KiB
Python
Executable file
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#! /usr/bin/env python3
# -*- coding: utf-8 -*-
"""Rename files with bad characters.
Working in the shell is often complicated when files have unusual names.
This script renames files in a directory tree so that they
no longer include those bad characters. The worst offender is whitespace.
"""
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
import logging
# import re as regex
import os
import os.path
# Names accepted by the --loglevel option, mapped to the numeric levels of
# the logging module (e.g. "debug" -> logging.DEBUG).
LOGGING_LEVELS = {
    level_name: getattr(logging, level_name.upper())
    for level_name in ("debug", "info", "warning", "error")
}
# Results of a transformation that are never used as a new file name
# (rename_file refuses to rename to any of these).
INVALID_NAMES = ["", ".", ".."]
# IGNORED_NAME_REGEX = regex.compile(r"(__.*__\.(py|html|txt)|.*\.class)")
# The check sketched by the regular expression above is done by a function
# (if_ignore) instead.
# Most common replacement string.
REPLACER = "_"
# One-to-one character table: the character at index i of SINGLE_SYMBOLS is
# replaced by the character at index i of SINGLE_REPLACE_SYMBOLS.
# NOTE(review): as displayed, the two strings do not appear to have the same
# length (the second ends in three dashes, the first in a single "=") -
# presumably some symbols are invisible or were lost; verify against the
# original file.
SINGLE_SYMBOLS = r'ãāǎàąćčēéěèȩęêėīíǐìĩïıńōóǒòøũūúǔùşśļłŁĻķўźżĶḩģĢḨņŅŃŗŖĀĄǍÀĆĒÉĚĘÈĪÍǏÌŚŌÓǑÒŪÚǓÙŹŻý='
SINGLE_REPLACE_SYMBOLS = r'aaaaacceeeeeeeeiiiiiiinooooouuuuussllLLkyzzKhgGHnNNrRAAAACEEEEEIIIISOOOOUUUUZZy---'
# Transliteration table for Cyrillic letters, based on
# https://programminghistorian.org/en/lessons/transliterating
# Deviation from that table: the hard sign (U+042A/U+044A) and the soft sign
# (U+042C/U+044C) are dropped instead of being written as '"' and "'".
# Quotes are exactly the characters this script removes from file names, and
# because this table is merged into REPLACEMENTS after the quote-removal
# entries, a quote produced here would survive in the new file name.
cyrillic = {
    '\u0410': 'A', '\u0430': 'a',
    '\u0411': 'B', '\u0431': 'b',
    '\u0412': 'V', '\u0432': 'v',
    '\u0413': 'G', '\u0433': 'g',
    '\u0414': 'D', '\u0434': 'd',
    '\u0415': 'E', '\u0435': 'e',
    '\u0416': 'Zh', '\u0436': 'zh',
    '\u0417': 'Z', '\u0437': 'z',
    '\u0418': 'I', '\u0438': 'i',
    '\u0419': 'I', '\u0439': 'i',
    '\u041a': 'K', '\u043a': 'k',
    '\u041b': 'L', '\u043b': 'l',
    '\u041c': 'M', '\u043c': 'm',
    '\u041d': 'N', '\u043d': 'n',
    '\u041e': 'O', '\u043e': 'o',
    '\u041f': 'P', '\u043f': 'p',
    '\u0420': 'R', '\u0440': 'r',
    '\u0421': 'S', '\u0441': 's',
    '\u0422': 'T', '\u0442': 't',
    '\u0423': 'U', '\u0443': 'u',
    '\u0424': 'F', '\u0444': 'f',
    '\u0425': 'Kh', '\u0445': 'kh',
    '\u0426': 'Ts', '\u0446': 'ts',
    '\u0427': 'Ch', '\u0447': 'ch',
    '\u0428': 'Sh', '\u0448': 'sh',
    '\u0429': 'Shch', '\u0449': 'shch',
    '\u042a': '', '\u044a': '',  # hard sign: dropped
    '\u042b': 'Y', '\u044b': 'y',
    '\u042c': '', '\u044c': '',  # soft sign: dropped
    '\u042d': 'E', '\u044d': 'e',
    '\u042e': 'Iu', '\u044e': 'iu',
    '\u042f': 'Ia', '\u044f': 'ia',
}
# Substring -> replacement table used by transform_filename.
# The entries are applied one after another in insertion order, so longer
# keys that contain a shorter key (e.g. "C#" before "#", "μFSR" before "μ")
# have to come first.
# NOTE(review): several keys below appear as empty strings - presumably
# the original characters are invisible or were lost in rendering.
# An empty key would make str.replace insert the replacement between all
# characters; verify these entries against the original file.
REPLACEMENTS = {'\n': REPLACER,
    # u with diaeresis plus tone mark
    "ǖ": "ue",
    "ǘ": "ue",
    "ǚ": "ue",
    "ǜ": "ue",
    "Ǖ": "Ue",
    "Ǘ": "Ue",
    "Ǚ": "Ue",
    "Ǜ": "Ue",
    # quotes and accents are dropped
    r"'": "",
    r'"': "",
    r"`": "",
    r"´": "",
    # symbols spelled out with German words ("und", "Prozent")
    r"+": REPLACER + "und" + REPLACER,
    r"&": REPLACER + "und" + REPLACER,
    r"%": "Prozent" + REPLACER,
    r"*": "x",
    r"=": "-",
    r"(with lyrics)": "",
    r" ": REPLACER,
    # "C#"/"c#" must be handled before the generic "#" entry
    r"C#": "C_sharp",
    r"c#": "c_sharp",
    r"#": REPLACER,
    r"|": "l",
    r",": REPLACER,
    # brackets are dropped
    r"{": "",
    r"}": "",
    r"(": "",
    r")": "",
    r"[": "",
    r"]": "",
    r"~": "-",
    r":": REPLACER,
    r"@": "-at-",
    r"?": REPLACER,
    r"": REPLACER,
    r"": REPLACER,
    r"!": REPLACER,
    r"": REPLACER,
    r"": REPLACER,
    r">": REPLACER,
    r"<": REPLACER,
    # umlauts and other non-ASCII Latin letters as two-letter sequences
    r"ä": "ae",
    r"æ": "ae",
    r"Ä": "Ae",
    r"á": "au", # icelandic
    r"Á": "Au",
    r"ö": "oe",
    r"Ö": "Oe",
    r"ü": "ue",
    r"Ü": "Ue",
    r"": "SS",
    r"ß": "ss",
    r"ð": "dh",
    r"Ð": "Dh",
    r"þ": "th",
    r"Þ": "Th",
    # Greek letters; the "...FSR" variants must precede the bare letter
    r"μFSR": "myFSR",
    r"μfsr": "myfsr",
    r"μ": "mu",
    r"φFSR": "phyFSR",
    r"φfsr": "phyfsr",
    r"φ": "phi",
    r"$": "USD",
    r"": "EUR",
    # normalize JPEG file extensions
    r".jpeg": ".jpg",
    r".JPG": ".jpg",
    r".JPEG": ".jpg",
    "\u202f": "_", # some space
    "": "Nr",
    "°C": "degCelsius",
    "": "_",
    "\\": REPLACER
}
# Merge the one-to-one character table: the symbol at position i of
# SINGLE_SYMBOLS is replaced by the character at position i of
# SINGLE_REPLACE_SYMBOLS.
REPLACEMENTS.update(dict(
    (symbol, SINGLE_REPLACE_SYMBOLS[position])
    for position, symbol in enumerate(SINGLE_SYMBOLS)
))
# The Cyrillic transliterations are merged after the Latin tables.
REPLACEMENTS.update(cyrillic)
# Every code point of a range (hexadecimal bounds, both inclusive) gets the
# same replacement.
for begin, end, replacement in [
        ('2000', '200F', REPLACER),  # various spaces and formatters
        ('2010', '2017', '-'),  # various dashes
        ('2018', '201F', ""),  # various apostrophes
        ('2028', '202E', ""),  # separators, format characters
        ('2032', '203A', ""),  # various apostrophes and quotation marks
        # one could add many more
]:
    REPLACEMENTS.update(dict.fromkeys(
        map(chr, range(int(begin, 16), int(end, 16) + 1)),
        replacement))
def parse_args(argv=None):
    """Parse command line arguments.

    Args:
        argv: Optional list of argument strings. If None (the default),
            argparse reads the arguments of the running program.

    Returns:
        argparse.Namespace with the attributes ask, rename, log, loglevel,
        quiet and top. If quiet is set, loglevel is forced to "warning".
    """
    parser = ArgumentParser(
        description="""Remove bad characters by renaming files.
        Which files are renamed is logged in a log file.
        (See below in --log.)""",
        formatter_class=ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        "-a", "--noask", "--no-interactive",
        dest="ask",
        help=("Specify if renaming should not be confirmed by the user" +
              " but just done."),
        action="store_false"
    )
    parser.add_argument(
        "-l", "--onlylog",
        dest="rename",
        action="store_false",
        help="Dry run."
    )
    parser.add_argument(
        "--log", "--logfile",
        dest="log",
        metavar="L",
        # The help used to name 'rename.log', contradicting the real default
        # (one directory up) that the formatter prints right after it.
        help="""the file where to write the logging output.
        This file is overwritten if existing.
        If not specified, 'rename.log' in the parent directory
        of the current working directory is used.""",
        action="store",
        default=os.path.join("..", "rename.log")
    )
    parser.add_argument(
        "--loglevel", "--level",
        dest="loglevel",
        help="""Specify the log level.
        'error' includes OS errors at renaming files.
        'warning' additionally includes no-renames because of weird resulting file name.
        'info' includes no-renames because of existing files and renames and working directories.
        'debug' includes good files.""",
        default="info",
        choices=LOGGING_LEVELS.keys()
    )
    parser.add_argument(
        "-q", "--quiet",
        dest="quiet",
        help=("Set log level to warning, so usually do not create a log file. "
              + "Overwrites loglevel option."),
        action="store_true"
    )
    # Optional positional argument: the root of the tree to process.
    parser.add_argument(
        nargs="?",
        dest="top",
        metavar="directory",
        help="""The directory in which to rename files.""",
        action="store",
        default=".")
    args = parser.parse_args(argv)
    if args.quiet:
        # --quiet takes precedence over an explicitly given --loglevel.
        args.loglevel = "warning"
    return args
def transform_filename(name: str, replacements=None, replacer=None) -> str:
    """Remove all bad symbols from name.

    Args:
        name: The file or directory name (without path) to clean.
        replacements: Optional mapping from substring to replacement,
            applied in iteration order. Defaults to REPLACEMENTS.
        replacer: Optional filler string that is collapsed next to the
            separators "-", "_" and ".". Defaults to REPLACER.

    Returns:
        The cleaned name. It has no leading "-", and no separator is
        doubled or directly adjacent to the filler string.
    """
    if replacements is None:
        replacements = REPLACEMENTS
    if replacer is None:
        replacer = REPLACER
    for symbol, replacement in replacements.items():
        # str.replace("", x) would insert x between all characters, so an
        # empty key must never be applied.
        if symbol:
            name = name.replace(symbol, replacement)
    # Repeat until nothing changes: collapsing separators can uncover a new
    # leading dash and stripping dashes can uncover new separator pairs.
    while True:
        old_name = name
        # str.lstrip returns a new string; the result has to be assigned.
        name = name.lstrip("-")
        for separator in ("-", "_", "."):
            name = name.replace(replacer + separator, separator)
            name = name.replace(separator + replacer, separator)
            name = name.replace(separator + separator, separator)
        if old_name == name:
            break
    return name
def rename_file(directory, file, args):
    """Rename file directory/file if renaming is OK.

    Args:
        directory: Path of the directory that contains the entry.
        file: Name of the file or directory inside ``directory``.
        args: Parsed command line arguments; ``args.ask`` (confirm each
            rename interactively) and ``args.rename`` (False for a dry run)
            are read.

    Returns:
        The name of the entry after processing: the new name if the rename
        was carried out, otherwise the unchanged old name.
    """
    new_name = transform_filename(file)
    if new_name == file:
        logging.debug("'{}' is OK.".format(file))
        return file
    path = os.path.join(directory, file)
    # This has to be checked before the existence test: "", "." and ".."
    # joined to the directory normally name an existing path, so the
    # existence test would otherwise always hide this warning.
    if new_name in INVALID_NAMES:
        logging.warning("'{}' is not renamed because it would invalid: '{}'.".format(
            path, new_name))
        return file
    new_path = os.path.join(directory, new_name)
    if os.path.lexists(new_path):
        logging.info("'{}' is not renamed to '{}' because this already exists.".format(
            file, new_name
        ))
        return file
    if args.ask:
        answer = input("Rename '{}' to '{}'? (Enter for yes)".format(
            path, new_name
        ))
        # Everything that does not start with "n"/"N" counts as consent.
        rename = not answer.lower().startswith("n")
    else:
        rename = True
    if not rename:
        logging.info("Did not rename '{}' to '{}' due to user choice.".format(
            file, new_name
        ))
        return file
    logging.info("Rename '{}' to '{}'".format(file, new_name))
    if not args.rename:  # == onlylog (dry run): the entry keeps its name
        return file
    try:
        os.rename(path, new_path)
    except OSError as error:
        # Any OS-level failure (missing file, missing permission, ...) is
        # logged and skipped so that one bad entry does not abort the run.
        logging.error("Could not move '{}' to '{}' due to {}: {}".format(
            path, new_name, type(error).__name__, error
        ))
        return file
    return new_name
def if_ignore(filename):
    """Tell whether the entry called filename is to be skipped.

    For a directory this means that its content is skipped as well.
    """
    # Names that are skipped as a whole.
    if filename in ('__pycache__', "telegram-nachrichten"):
        return True
    # Hidden entries and java class files.
    if filename.startswith('.') or filename.endswith('.class'):
        return True
    # Special python files such as __init__.py, __x__.html or __x__.txt.
    has_special_ending = filename.endswith(('__.py', '__.html', '__.txt'))
    return filename.startswith('__') and has_special_ending
def main():
    """Walk the directory tree top-down and rename all offending entries."""
    args = parse_args()
    logging.basicConfig(
        filename=args.log,
        # The --log help promises that an existing log file is overwritten;
        # the default mode of basicConfig would append to it instead.
        filemode="w",
        level=LOGGING_LEVELS[args.loglevel]
    )
    for dirpath, dirnames, filenames in os.walk(args.top, topdown=True):
        print('.', end='')
        logging.info("Go to directory '{}'".format(os.path.abspath(dirpath)))
        # Build the list of directories to descend into separately instead
        # of removing from / appending to dirnames while iterating over it,
        # which skips the entry following every removed one.
        kept_dirs = []
        for dirname in dirnames:
            if if_ignore(dirname):
                logging.debug("Ignore dir '{}'".format(dirname))
                continue
            # rename_file returns the name the directory has afterwards.
            kept_dirs.append(rename_file(dirpath, dirname, args))
        # In top-down mode os.walk descends into what is left in dirnames,
        # so the list has to be replaced in place.
        dirnames[:] = kept_dirs
        for file in filenames:
            if if_ignore(file):
                logging.debug("Ignore file '{}'".format(file))
            else:
                rename_file(dirpath, file, args)
    # remove empty log file:
    logging.shutdown()
    if os.stat(args.log).st_size == 0:
        os.remove(args.log)


if __name__ == "__main__":
    main()