update script to use alias directory
This commit is contained in:
parent
63a5727f50
commit
d5c682cd40
1 changed file with 33 additions and 11 deletions
|
@@ -217,7 +217,7 @@ def load_alias_parts(csvpath: str) -> tuple[list[str], list[str]]:
|
||||||
def extract_alias_parts(csv: "_csv._reader"):
|
def extract_alias_parts(csv: "_csv._reader"):
|
||||||
return tuple(zip(*((line[0], line[1]) for n, line in enumerate(csv) if n > 0)))
|
return tuple(zip(*((line[0], line[1]) for n, line in enumerate(csv) if n > 0)))
|
||||||
|
|
||||||
with open(csvpath, "r") as csvfile:
|
with open(csvpath, "r", encoding="utf-8") as csvfile:
|
||||||
alias_parts = extract_alias_parts(csv.reader(csvfile))
|
alias_parts = extract_alias_parts(csv.reader(csvfile))
|
||||||
|
|
||||||
print(f"Loaded {sum(len(part) for part in alias_parts)} alias parts")
|
print(f"Loaded {sum(len(part) for part in alias_parts)} alias parts")
|
||||||
|
@@ -262,21 +262,43 @@ def maybe_generate_aliases(
|
||||||
print("Reusing generated aliases")
|
print("Reusing generated aliases")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
with open("aliases/usedaliases.txt", "r", encoding="utf-8") as usedaliases_file:
|
||||||
|
usedaliases = set(
|
||||||
|
line.rstrip().lower() for line in usedaliases_file if line.count(" ") == 1
|
||||||
|
)
|
||||||
|
|
||||||
|
with open("aliases/suswords.txt", "r", encoding="utf-8") as suswords_file:
|
||||||
|
suswords = set(line.rstrip() for line in suswords_file)
|
||||||
|
|
||||||
alias_to_email_address = {}
|
alias_to_email_address = {}
|
||||||
|
|
||||||
seed = args.seed or args.csv
|
seed = args.seed or args.csv
|
||||||
|
|
||||||
for row in csv_contents:
|
for row in csv_contents:
|
||||||
rnd = Random(",".join([row[KnownColumns.EmailAddress], seed]))
|
rnd = Random(",".join([row[KnownColumns.EmailAddress], seed]))
|
||||||
|
while True:
|
||||||
random_alias = f"{rnd.choice(alias_parts[0])} {rnd.choice(alias_parts[1])}"
|
random_alias = f"{rnd.choice(alias_parts[0])} {rnd.choice(alias_parts[1])}"
|
||||||
while (
|
if (
|
||||||
random_alias in alias_to_email_address
|
random_alias in alias_to_email_address
|
||||||
and alias_to_email_address[random_alias] != row[KnownColumns.EmailAddress]
|
and alias_to_email_address[random_alias]
|
||||||
|
!= row[KnownColumns.EmailAddress]
|
||||||
):
|
):
|
||||||
print(
|
print(
|
||||||
f"WARNING: rerolling alias for {row[KnownColumns.EmailAddress]} due to collision with {alias_to_email_address[random_alias]}"
|
f"Rerolling alias for {row[KnownColumns.EmailAddress]} because {repr(random_alias)} is already being used by {alias_to_email_address[random_alias]}"
|
||||||
)
|
)
|
||||||
random_alias = f"{rnd.choice(alias_parts[0])} {rnd.choice(alias_parts[0])}"
|
elif random_alias in usedaliases:
|
||||||
|
print(
|
||||||
|
f"Rerolling alias for {row[KnownColumns.EmailAddress]} because {repr(random_alias)} has already been used"
|
||||||
|
)
|
||||||
|
elif any(
|
||||||
|
random_part in suswords for random_part in random_alias.split(" ")
|
||||||
|
):
|
||||||
|
print(
|
||||||
|
f"WARNING: alias for {row[KnownColumns.EmailAddress]} {repr(random_alias)} contains a sus word"
|
||||||
|
)
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
break
|
||||||
row[KnownColumns.GeneratedAlias] = random_alias
|
row[KnownColumns.GeneratedAlias] = random_alias
|
||||||
|
|
||||||
print("Generated an alias for each entry")
|
print("Generated an alias for each entry")
|
||||||
|
@@ -553,7 +575,7 @@ def main(argv: list[str]):
|
||||||
raw_args = argparser().parse_args(argv[1:], namespace=AnonymizeEntriesRawArgs())
|
raw_args = argparser().parse_args(argv[1:], namespace=AnonymizeEntriesRawArgs())
|
||||||
args = process_args(raw_args)
|
args = process_args(raw_args)
|
||||||
assert_valid_file_paths(args)
|
assert_valid_file_paths(args)
|
||||||
alias_parts = load_alias_parts("aliasparts.csv")
|
alias_parts = load_alias_parts("aliases/aliasparts.csv")
|
||||||
csv_contents = load_csv_contents(args)
|
csv_contents = load_csv_contents(args)
|
||||||
assert_known_google_forms_columns_present(csv_contents)
|
assert_known_google_forms_columns_present(csv_contents)
|
||||||
dynamic_columns = detect_dynamic_columns(csv_contents)
|
dynamic_columns = detect_dynamic_columns(csv_contents)
|
||||||
|
|
Loading…
Reference in a new issue