diff --git a/anonymize_entries.py b/anonymize_entries.py index 83ef17e..99e3529 100644 --- a/anonymize_entries.py +++ b/anonymize_entries.py @@ -217,7 +217,7 @@ def load_alias_parts(csvpath: str) -> tuple[list[str], list[str]]: def extract_alias_parts(csv: "_csv._reader"): return tuple(zip(*((line[0], line[1]) for n, line in enumerate(csv) if n > 0))) - with open(csvpath, "r") as csvfile: + with open(csvpath, "r", encoding="utf-8") as csvfile: alias_parts = extract_alias_parts(csv.reader(csvfile)) print(f"Loaded {sum(len(part) for part in alias_parts)} alias parts") @@ -262,21 +262,43 @@ def maybe_generate_aliases( print("Reusing generated aliases") return False + with open("aliases/usedaliases.txt", "r", encoding="utf-8") as usedaliases_file: + usedaliases = set( + line.rstrip().lower() for line in usedaliases_file if line.count(" ") == 1 + ) + + with open("aliases/suswords.txt", "r", encoding="utf-8") as suswords_file: + suswords = set(line.rstrip() for line in suswords_file) + alias_to_email_address = {} seed = args.seed or args.csv for row in csv_contents: rnd = Random(",".join([row[KnownColumns.EmailAddress], seed])) - random_alias = f"{rnd.choice(alias_parts[0])} {rnd.choice(alias_parts[1])}" - while ( - random_alias in alias_to_email_address - and alias_to_email_address[random_alias] != row[KnownColumns.EmailAddress] - ): - print( - f"WARNING: rerolling alias for {row[KnownColumns.EmailAddress]} due to collision with {alias_to_email_address[random_alias]}" - ) - random_alias = f"{rnd.choice(alias_parts[0])} {rnd.choice(alias_parts[0])}" + while True: + random_alias = f"{rnd.choice(alias_parts[0])} {rnd.choice(alias_parts[1])}" + if ( + random_alias in alias_to_email_address + and alias_to_email_address[random_alias] + != row[KnownColumns.EmailAddress] + ): + print( + f"Rerolling alias for {row[KnownColumns.EmailAddress]} because {repr(random_alias)} is already being used by {alias_to_email_address[random_alias]}" + ) + elif random_alias in usedaliases: + print( + f"Rerolling alias for {row[KnownColumns.EmailAddress]} because {repr(random_alias)} has already been used" + ) + elif any( + random_part in suswords for random_part in random_alias.split(" ") + ): + print( + f"WARNING: alias for {row[KnownColumns.EmailAddress]} {repr(random_alias)} contains a sus word" + ) + break + else: + break row[KnownColumns.GeneratedAlias] = random_alias print("Generated an alias for each entry") @@ -553,7 +575,7 @@ def main(argv: list[str]): raw_args = argparser().parse_args(argv[1:], namespace=AnonymizeEntriesRawArgs()) args = process_args(raw_args) assert_valid_file_paths(args) - alias_parts = load_alias_parts("aliasparts.csv") + alias_parts = load_alias_parts("aliases/aliasparts.csv") csv_contents = load_csv_contents(args) assert_known_google_forms_columns_present(csv_contents) dynamic_columns = detect_dynamic_columns(csv_contents)