update script to use alias directory
This commit is contained in:
parent
63a5727f50
commit
d5c682cd40
1 changed file with 33 additions and 11 deletions
|
@ -217,7 +217,7 @@ def load_alias_parts(csvpath: str) -> tuple[list[str], list[str]]:
|
|||
def extract_alias_parts(csv: "_csv._reader"):
|
||||
return tuple(zip(*((line[0], line[1]) for n, line in enumerate(csv) if n > 0)))
|
||||
|
||||
with open(csvpath, "r") as csvfile:
|
||||
with open(csvpath, "r", encoding="utf-8") as csvfile:
|
||||
alias_parts = extract_alias_parts(csv.reader(csvfile))
|
||||
|
||||
print(f"Loaded {sum(len(part) for part in alias_parts)} alias parts")
|
||||
|
@ -262,21 +262,43 @@ def maybe_generate_aliases(
|
|||
print("Reusing generated aliases")
|
||||
return False
|
||||
|
||||
with open("aliases/usedaliases.txt", "r", encoding="utf-8") as usedaliases_file:
|
||||
usedaliases = set(
|
||||
line.rstrip().lower() for line in usedaliases_file if line.count(" ") == 1
|
||||
)
|
||||
|
||||
with open("aliases/suswords.txt", "r", encoding="utf-8") as suswords_file:
|
||||
suswords = set(line.rstrip() for line in suswords_file)
|
||||
|
||||
alias_to_email_address = {}
|
||||
|
||||
seed = args.seed or args.csv
|
||||
|
||||
for row in csv_contents:
|
||||
rnd = Random(",".join([row[KnownColumns.EmailAddress], seed]))
|
||||
random_alias = f"{rnd.choice(alias_parts[0])} {rnd.choice(alias_parts[1])}"
|
||||
while (
|
||||
random_alias in alias_to_email_address
|
||||
and alias_to_email_address[random_alias] != row[KnownColumns.EmailAddress]
|
||||
):
|
||||
print(
|
||||
f"WARNING: rerolling alias for {row[KnownColumns.EmailAddress]} due to collision with {alias_to_email_address[random_alias]}"
|
||||
)
|
||||
random_alias = f"{rnd.choice(alias_parts[0])} {rnd.choice(alias_parts[0])}"
|
||||
while True:
|
||||
random_alias = f"{rnd.choice(alias_parts[0])} {rnd.choice(alias_parts[1])}"
|
||||
if (
|
||||
random_alias in alias_to_email_address
|
||||
and alias_to_email_address[random_alias]
|
||||
!= row[KnownColumns.EmailAddress]
|
||||
):
|
||||
print(
|
||||
f"Rerolling alias for {row[KnownColumns.EmailAddress]} because {repr(random_alias)} is already being used by {alias_to_email_address[random_alias]}"
|
||||
)
|
||||
elif random_alias in usedaliases:
|
||||
print(
|
||||
f"Rerolling alias for {row[KnownColumns.EmailAddress]} because {repr(random_alias)} has already been used"
|
||||
)
|
||||
elif any(
|
||||
random_part in suswords for random_part in random_alias.split(" ")
|
||||
):
|
||||
print(
|
||||
f"WARNING: alias for {row[KnownColumns.EmailAddress]} {repr(random_alias)} contains a sus word"
|
||||
)
|
||||
break
|
||||
else:
|
||||
break
|
||||
row[KnownColumns.GeneratedAlias] = random_alias
|
||||
|
||||
print("Generated an alias for each entry")
|
||||
|
@ -553,7 +575,7 @@ def main(argv: list[str]):
|
|||
raw_args = argparser().parse_args(argv[1:], namespace=AnonymizeEntriesRawArgs())
|
||||
args = process_args(raw_args)
|
||||
assert_valid_file_paths(args)
|
||||
alias_parts = load_alias_parts("aliasparts.csv")
|
||||
alias_parts = load_alias_parts("aliases/aliasparts.csv")
|
||||
csv_contents = load_csv_contents(args)
|
||||
assert_known_google_forms_columns_present(csv_contents)
|
||||
dynamic_columns = detect_dynamic_columns(csv_contents)
|
||||
|
|
Loading…
Reference in a new issue