Update script to read alias files from the aliases/ directory; reroll already-used aliases and warn on sus words

This commit is contained in:
Ash Garcia 2024-09-04 19:59:26 -07:00
parent 63a5727f50
commit d5c682cd40

View file

@ -217,7 +217,7 @@ def load_alias_parts(csvpath: str) -> tuple[list[str], list[str]]:
def extract_alias_parts(csv: "_csv._reader"):
return tuple(zip(*((line[0], line[1]) for n, line in enumerate(csv) if n > 0)))
with open(csvpath, "r") as csvfile:
with open(csvpath, "r", encoding="utf-8") as csvfile:
alias_parts = extract_alias_parts(csv.reader(csvfile))
print(f"Loaded {sum(len(part) for part in alias_parts)} alias parts")
@ -262,21 +262,43 @@ def maybe_generate_aliases(
print("Reusing generated aliases")
return False
with open("aliases/usedaliases.txt", "r", encoding="utf-8") as usedaliases_file:
usedaliases = set(
line.rstrip().lower() for line in usedaliases_file if line.count(" ") == 1
)
with open("aliases/suswords.txt", "r", encoding="utf-8") as suswords_file:
suswords = set(line.rstrip() for line in suswords_file)
alias_to_email_address = {}
seed = args.seed or args.csv
for row in csv_contents:
rnd = Random(",".join([row[KnownColumns.EmailAddress], seed]))
random_alias = f"{rnd.choice(alias_parts[0])} {rnd.choice(alias_parts[1])}"
while (
random_alias in alias_to_email_address
and alias_to_email_address[random_alias] != row[KnownColumns.EmailAddress]
):
print(
f"WARNING: rerolling alias for {row[KnownColumns.EmailAddress]} due to collision with {alias_to_email_address[random_alias]}"
)
random_alias = f"{rnd.choice(alias_parts[0])} {rnd.choice(alias_parts[0])}"
while True:
random_alias = f"{rnd.choice(alias_parts[0])} {rnd.choice(alias_parts[1])}"
if (
random_alias in alias_to_email_address
and alias_to_email_address[random_alias]
!= row[KnownColumns.EmailAddress]
):
print(
f"Rerolling alias for {row[KnownColumns.EmailAddress]} because {repr(random_alias)} is already being used by {alias_to_email_address[random_alias]}"
)
elif random_alias in usedaliases:
print(
f"Rerolling alias for {row[KnownColumns.EmailAddress]} because {repr(random_alias)} has already been used"
)
elif any(
random_part in suswords for random_part in random_alias.split(" ")
):
print(
f"WARNING: alias for {row[KnownColumns.EmailAddress]} {repr(random_alias)} contains a sus word"
)
break
else:
break
row[KnownColumns.GeneratedAlias] = random_alias
print("Generated an alias for each entry")
@ -553,7 +575,7 @@ def main(argv: list[str]):
raw_args = argparser().parse_args(argv[1:], namespace=AnonymizeEntriesRawArgs())
args = process_args(raw_args)
assert_valid_file_paths(args)
alias_parts = load_alias_parts("aliasparts.csv")
alias_parts = load_alias_parts("aliases/aliasparts.csv")
csv_contents = load_csv_contents(args)
assert_known_google_forms_columns_present(csv_contents)
dynamic_columns = detect_dynamic_columns(csv_contents)