diff --git a/anonymize_entries.py b/anonymize_entries.py index 79f846a..af91628 100644 --- a/anonymize_entries.py +++ b/anonymize_entries.py @@ -145,16 +145,24 @@ def maybe_generate_aliases( if reuse_aliases: print("Reusing generated aliases") return False - else: - for row in csv_contents: - random = Random( - "; ".join([row[KnownColumns.EmailAddress], args.csv, args.files]) + + alias_to_email_address = {} + + for row in csv_contents: + rnd = Random("; ".join([row[KnownColumns.EmailAddress], args.csv, args.files])) + random_alias = f"{rnd.choice(alias_parts[0])} {rnd.choice(alias_parts[0])}" + while ( + random_alias in alias_to_email_address + and alias_to_email_address[random_alias] != row[KnownColumns.EmailAddress] + ): + print( + f"WARNING: rerolling alias for {row[KnownColumns.EmailAddress]} due to collision with {alias_to_email_address[random_alias]}" ) - row[KnownColumns.GeneratedAlias] = ( - f"{random.choice(alias_parts[0])} {random.choice(alias_parts[0])}" - ) - print("Generated an alias for each entry") - return True + random_alias = f"{rnd.choice(alias_parts[0])} {rnd.choice(alias_parts[0])}" + row[KnownColumns.GeneratedAlias] = random_alias + + print("Generated an alias for each entry") + return True def maybe_mark_resubmitted_entries(csv_contents: CsvContents) -> ChangedCsvContents: