website/removeBadSymbols.py

322 lines
11 KiB
Python
Raw Normal View History

2021-01-11 02:59:31 +01:00
#! /usr/bin/env python3
# -*- coding: utf-8 -*-
"""Rename files with bad characters.
Working in the shell is often complicated if files have weird names.
This script renames files in a directory tree so that they
do not include those bad characters. The worst are whitespace.
"""
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
import logging
2021-02-12 19:41:21 +01:00
# import re as regex
2021-01-11 02:59:31 +01:00
import os
import os.path
# Maps the values accepted by the --loglevel option to logging module levels.
LOGGING_LEVELS = dict(
    debug=logging.DEBUG,
    info=logging.INFO,
    warning=logging.WARNING,
    error=logging.ERROR,
)

# File names that must never be the result of a renaming.
INVALID_NAMES = [
    "",
    ".",
    "..",
]
2021-02-12 19:41:21 +01:00
# IGNORED_NAME_REGEX = regex.compile(r"(__.*__\.(py|html|txt)|.*\.class)")
# (no longer used: the check is now done by the function if_ignore)
2021-01-11 02:59:31 +01:00
# most common replacement string
REPLACER = "_"

# Letters with diacritics (and '=') that are replaced by exactly one plain
# character.  Each replacement is stored next to the symbols it stands for,
# so the two strings derived below can never get out of step.
_SINGLE_SYMBOL_GROUPS = (
    ("a", "ãāǎàą"),
    ("c", "ćč"),
    ("e", "ēéěèȩęêė"),
    ("i", "īíǐìĩïı"),
    ("n", "ń"),
    ("o", "ōóǒòø"),
    ("u", "ũūúǔù"),
    ("s", "şś"),
    ("l", "ļł"),
    ("L", "ŁĻ"),
    ("k", "ķ"),
    ("y", "ў"),
    ("z", "źż"),
    ("K", "Ķ"),
    ("h", "ḩ"),
    ("g", "ģ"),
    ("G", "Ģ"),
    ("H", "Ḩ"),
    ("n", "ņ"),
    ("N", "ŅŃ"),
    ("r", "ŗ"),
    ("R", "Ŗ"),
    ("A", "ĀĄǍÀ"),
    ("C", "Ć"),
    ("E", "ĒÉĚĘÈ"),
    ("I", "ĪÍǏÌ"),
    ("S", "Ś"),
    ("O", "ŌÓǑÒ"),
    ("U", "ŪÚǓÙ"),
    ("Z", "ŹŻ"),
    ("y", "ý"),
    ("-", "="),
)
# SINGLE_SYMBOLS[i] is replaced by SINGLE_REPLACE_SYMBOLS[i].
SINGLE_SYMBOLS = "".join(symbols for _, symbols in _SINGLE_SYMBOL_GROUPS)
SINGLE_REPLACE_SYMBOLS = "".join(
    plain * len(symbols) for plain, symbols in _SINGLE_SYMBOL_GROUPS)

# from https://programminghistorian.org/en/lessons/transliterating:
cyrillic = {
    '\u0410': 'A', '\u0430': 'a',
    '\u0411': 'B', '\u0431': 'b',
    '\u0412': 'V', '\u0432': 'v',
    '\u0413': 'G', '\u0433': 'g',
    '\u0414': 'D', '\u0434': 'd',
    '\u0415': 'E', '\u0435': 'e',
    '\u0416': 'Zh', '\u0436': 'zh',
    '\u0417': 'Z', '\u0437': 'z',
    '\u0418': 'I', '\u0438': 'i',
    '\u0419': 'I', '\u0439': 'i',
    '\u041a': 'K', '\u043a': 'k',
    '\u041b': 'L', '\u043b': 'l',
    '\u041c': 'M', '\u043c': 'm',
    '\u041d': 'N', '\u043d': 'n',
    '\u041e': 'O', '\u043e': 'o',
    '\u041f': 'P', '\u043f': 'p',
    '\u0420': 'R', '\u0440': 'r',
    '\u0421': 'S', '\u0441': 's',
    '\u0422': 'T', '\u0442': 't',
    '\u0423': 'U', '\u0443': 'u',
    '\u0424': 'F', '\u0444': 'f',
    '\u0425': 'Kh', '\u0445': 'kh',
    '\u0426': 'Ts', '\u0446': 'ts',
    '\u0427': 'Ch', '\u0447': 'ch',
    '\u0428': 'Sh', '\u0448': 'sh',
    '\u0429': 'Shch', '\u0449': 'shch',
    '\u042a': '"', '\u044a': '"',
    '\u042b': 'Y', '\u044b': 'y',
    '\u042c': "'", '\u044c': "'",
    '\u042d': 'E', '\u044d': 'e',
    '\u042e': 'Iu', '\u044e': 'iu',
    '\u042f': 'Ia', '\u044f': 'ia',
}

# Maps every bad symbol (or bad substring) to its replacement.
# The insertion order matters: the entries are applied one after the other,
# so longer patterns such as "C#" have to come before their parts ("#").
# No key may be the empty string: str.replace("", x) would insert x between
# all characters of a file name.
REPLACEMENTS = {
    '\n': REPLACER,
    "ǖ": "ue",
    "ǘ": "ue",
    "ǚ": "ue",
    "ǜ": "ue",
    "Ǖ": "Ue",
    "Ǘ": "Ue",
    "Ǚ": "Ue",
    "Ǜ": "Ue",
    "'": "",
    '"': "",
    "`": "",
    "´": "",
    "&": REPLACER + "und" + REPLACER,
    "%": "Prozent" + REPLACER,
    "*": "x",
    "(with lyrics)": "",
    " ": REPLACER,
    "C#": "C_sharp",
    "c#": "c_sharp",
    "#": REPLACER,
    "|": "l",
    ",": REPLACER,
    "{": "",
    "}": "",
    "(": "",
    ")": "",
    "[": "",
    "]": "",
    "~": "-",
    ":": REPLACER,
    "@": "-at-",
    "?": "",
    ">": REPLACER,
    "<": REPLACER,
    "ä": "ae",
    "æ": "ae",
    "Ä": "Ae",
    "á": "au",  # icelandic
    "Á": "Au",
    "ö": "oe",
    "Ö": "Oe",
    "ü": "ue",
    "Ü": "Ue",
    "\u1e9e": "SS",  # capital sharp s; written as escape to survive re-encoding
    "ß": "ss",
    "ð": "dh",
    "Ð": "Dh",
    "þ": "th",
    "Þ": "Th",
    "μFSR": "myFSR",
    "μfsr": "myfsr",
    "μ": "mu",
    "φFSR": "phyFSR",
    "φfsr": "phyfsr",
    "φ": "phi",
    "$": "USD",
    "\u20ac": "EUR",  # euro sign
    ".jpeg": ".jpg",
    ".JPG": ".jpg",
    ".JPEG": ".jpg",
    "\u202f": "_",  # some space
    "\u2116": "Nr",  # numero sign
    "°C": "degCelsius",
    "\\": REPLACER,
}
REPLACEMENTS.update(zip(SINGLE_SYMBOLS, SINGLE_REPLACE_SYMBOLS))
# The transliteration table renders the hard and the soft sign as quotation
# marks.  Those are bad symbols themselves and the entries removing them have
# already been applied when the Cyrillic entries are reached, so the quotes
# are dropped here instead of being written into the file name.
REPLACEMENTS.update(
    {letter: latin.strip("'\"") for letter, latin in cyrillic.items()})
for begin, end, replacement in [
        (0x2000, 0x200F, REPLACER),  # various spaces and formatters
        (0x2010, 0x2017, '-'),  # various dashes
        (0x2018, 0x201F, ""),  # various apostrophes
        (0x2028, 0x202E, ""),  # separators, format characters
        (0x2032, 0x203A, ""),  # various apostrophes and quotation marks
        # one could add many more
]:
    REPLACEMENTS.update(
        {chr(code_point): replacement
         for code_point in range(begin, end + 1)})
def parse_args(argv=None):
    """Parse command line arguments.

    Args:
        argv: list of argument strings to parse. If None (the default),
            the arguments of the running program (sys.argv[1:]) are used.

    Returns:
        The argparse namespace with the attributes
        ask, rename, log, loglevel, quiet and top.
        If --quiet is given, loglevel is forced to "warning".
    """
    parser = ArgumentParser(
        description="""Remove bad characters by renaming files.
        Which files are renamed is logged in a log file.
        (See below in --log.)""",
        formatter_class=ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        "-a", "--noask", "--no-interactive",
        dest="ask",
        help=("Specify if renaming should not be confirmed by the user" +
              " but just done."),
        action="store_false"
    )
    parser.add_argument(
        "-l", "--onlylog",
        dest="rename",
        action="store_false",
        help="Dry run."
    )
    parser.add_argument(
        "--log", "--logfile",
        dest="log",
        metavar="L",
        # The actual default is appended by ArgumentDefaultsHelpFormatter,
        # so the help text must not name a (different) default itself.
        help="""the file where to write the logging output.
        This file is overwritten if existing.""",
        action="store",
        default=os.path.join("..", "rename.log")
    )
    parser.add_argument(
        "--loglevel", "--level",
        dest="loglevel",
        help="""Specify the log level.
        'error' includes OS errors at renaming files.
        'warning' additionally includes no-renames because of weird resulting file name.
        'info' includes no-renames because of existing files and renames and working directories.
        'debug' includes good files.""",
        default="info",
        choices=LOGGING_LEVELS.keys()
    )
    parser.add_argument(
        "-q", "--quiet",
        dest="quiet",
        help=("Set log level to warning, so usually do not create a log file. "
              + "Overwrites loglevel option."),
        action="store_true"
    )
    # optional positional argument: the root of the tree to work on
    parser.add_argument(
        nargs="?",
        dest="top",
        metavar="directory",
        help="""The directory in which to rename files.""",
        action="store",
        default=".")
    args = parser.parse_args(argv)
    if args.quiet:
        args.loglevel = "warning"
    return args
2021-01-11 02:59:31 +01:00
def transform_filename(name: str, replacements=None, replacer=None) -> str:
    """Remove all bad symbols from name.

    First every key of the replacement table found in name is replaced by
    its value, in the order of the table. Then leading dashes are removed
    and runs of separators ("-", "_", ".", and the replacer next to one of
    them) are collapsed until nothing changes any more.

    Args:
        name: the file name (without directory) to clean up.
        replacements: mapping from bad substring to replacement.
            Defaults to the module level REPLACEMENTS.
        replacer: the generic replacement string that is squeezed out next
            to separators. Defaults to the module level REPLACER.

    Returns:
        The cleaned file name. It may be empty, the caller has to check this.
    """
    if replacements is None:
        replacements = REPLACEMENTS
    if replacer is None:
        replacer = REPLACER
    for symbol, replacement in replacements.items():
        if not symbol:
            # replacing "" would insert the replacement between all characters
            continue
        name = name.replace(symbol, replacement)
    while True:
        old_name = name
        # str.lstrip returns a new string, so the result has to be assigned.
        # It is done inside the loop because collapsing separators can
        # uncover a new leading dash (e.g. "_-a" -> "-a").
        name = name.lstrip("-")
        for separator in ("-", "_", "."):
            name = name.replace(replacer + separator, separator)
            name = name.replace(separator + replacer, separator)
            name = name.replace(separator + separator, separator)
        if old_name == name:
            return name
def rename_file(directory, file, args):
    """Rename file directory/file if renaming is OK.

    The file is not renamed if its name is already fine, if the new name
    would be invalid, if something with the new name already exists,
    if the user declines (only asked if args.ask is set)
    or if this is a dry run (args.rename is not set).

    Args:
        directory: path of the directory containing the file.
        file: name of the file or directory inside of directory.
        args: the parsed command line arguments; args.ask and args.rename
            are read.

    Returns:
        filename after processing, i.e. the name under which the file
        can be found in directory afterwards. This is always a string.
    """
    new_name = transform_filename(file)
    if new_name == file:
        logging.debug("'{}' is OK.".format(file))
        return file
    path = os.path.join(directory, file)
    new_path = os.path.join(directory, new_name)
    # This has to be checked before the existence check:
    # directory/"", directory/. and directory/.. always exist.
    if new_name in INVALID_NAMES:
        logging.warning("'{}' is not renamed because it would invalid: '{}'.".format(
            path, new_name))
        return file
    if os.path.lexists(new_path):
        logging.info("'{}' is not renamed to '{}' because this already exists.".format(
            file, new_name
        ))
        return file
    if args.ask:
        answer = input("Rename '{}' to '{}'? (Enter for yes)".format(
            path, new_name
        ))
        confirmed = not answer.lower().startswith("n")
    else:
        confirmed = True
    if not confirmed:
        logging.info("Did not rename '{}' to '{}' due to user choice.".format(
            file, new_name
        ))
        return file
    logging.info("Rename '{}' to '{}'".format(file, new_name))
    if not args.rename:  # == onlylog (dry run): the file keeps its name
        return file
    try:
        os.rename(path, new_path)
    except OSError as error:
        # includes FileNotFoundError, PermissionError, ...
        logging.error("Could not move '{}' to '{}' due to {}: {}".format(
            path, new_name, type(error).__name__, error
        ))
        return file
    return new_name
2021-02-12 19:41:21 +01:00
2021-01-11 02:59:31 +01:00
def if_ignore(filename):
    """Return if this filename should be ignored.

    If it's a directory, this means that the directory content is ignored as well.

    Ignored are hidden files (leading dot), '__pycache__',
    special python files ('__*__.py', '__*__.html', '__*__.txt'),
    java class files and 'telegram-nachrichten'.
    """
    if filename.startswith('.'):
        return True
    if filename in ('__pycache__', "telegram-nachrichten"):
        return True
    if filename.endswith('.class'):  # java class file
        return True
    # special python file
    return (filename.startswith('__')
            and filename.endswith(('__.py', '__.html', '__.txt')))
2021-01-11 02:59:31 +01:00
if __name__ == "__main__":
    # Walk the tree top-down and rename every directory and file
    # which is not ignored.
    args = parse_args()
    logging.basicConfig(
        filename=args.log,
        # the help of --log promises that the log file is overwritten
        filemode="w",
        level=LOGGING_LEVELS[args.loglevel]
    )
    for dirpath, dirnames, filenames in os.walk(args.top, topdown=True):
        print('.', end='', flush=True)  # progress indicator
        logging.info("Go to directory '{}'".format(os.path.abspath(dirpath)))
        # Collect the directories to descend into in a new list instead of
        # removing from / appending to dirnames while iterating over it.
        walk_into = []
        for dirname in dirnames:
            if if_ignore(dirname):
                logging.debug("Ignore dir '{}'".format(dirname))
                continue
            new_name = rename_file(dirpath, dirname, args)
            # fall back to the old name if no new name is reported
            walk_into.append(new_name or dirname)
        # os.walk only honours in-place modifications of dirnames
        dirnames[:] = walk_into
        for file in filenames:
            if if_ignore(file):
                logging.debug("Ignore file '{}'".format(file))
            else:
                rename_file(dirpath, file, args)
    print()  # finish the line of progress dots
    # remove empty log file:
    logging.shutdown()
    if os.stat(args.log).st_size == 0:
        os.remove(args.log)