nicer cli args

This commit is contained in:
Ash Garcia 2024-08-11 22:59:51 -07:00
parent 2b1bf885a1
commit d029642e08

View file

@ -8,6 +8,8 @@ import os
from random import Random from random import Random
import shutil import shutil
import sys import sys
import textwrap
from typing import cast
from zipfile import ZipFile from zipfile import ZipFile
import fs.path import fs.path
@ -28,27 +30,104 @@ from simfile.types import Simfile
#################### ####################
class AnonymizeEntriesArgs: class AnonymizeEntriesRawArgs:
data_dir: str | None
csv: str | None
file_uploads: str | None
deanonymized: bool
dry_run: bool
emails: str
output: str
regenerate: bool
seed: str
class AnonymizeEntriesArgs(AnonymizeEntriesRawArgs):
"""Stores the command-line arguments for this script.""" """Stores the command-line arguments for this script."""
csv: str csv: str
files: str file_uploads: str
dry_run: bool
def argparser() -> argparse.ArgumentParser:
    """Get an ArgumentParser instance for this command-line script."""
    parser = argparse.ArgumentParser(
        # Keep the hand-formatted example layout in the epilog intact
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=textwrap.dedent(
            """\
            example:
              path/to/folder:
                form_responses.csv
                file_responses/
                  Upload A - User 1.zip
                  Upload B - User 2.zip
                  etc.

              python ./anonymize_entries.py path/to/folder
                OR
              python ./anonymize_entries.py -c path/to/folder/form_responses.csv -f path/to/folder/file_responses
            """
        ),
    )
    parser.add_argument(
        "data_dir",
        nargs="?",  # optional: --csv and --file-uploads together can replace it
        type=str,
        help="directory containing both the CSV form data and the file responses (uploads)",
    )
    parser.add_argument(
        "-c",
        "--csv",
        type=str,
        help="override path to the CSV file of form responses",
    )
    parser.add_argument(
        "-f",
        "--file-uploads",
        type=str,
        help="override path to the directory of file responses (uploads)",
    )
    # The boolean flags default to False (not None) so the parsed attributes
    # are always real booleans.
    parser.add_argument(
        "-d",
        "--dry-run",
        action=argparse.BooleanOptionalAction,
        default=False,
        help="do not create or modify any files",
    )
    parser.add_argument(
        "-D",
        "--deanonymized",
        action=argparse.BooleanOptionalAction,
        default=False,
        help="skip anonymization of files, simply package them as-is",
    )
    parser.add_argument(
        "-e",
        "--emails",
        type=str,
        help="limit output to files from the specified emails (comma-separated)",
    )
    parser.add_argument(
        "-o",
        "--output",
        type=str,
        default="output/",
        help="output directory",
    )
    parser.add_argument(
        "-r",
        "--regenerate",
        action=argparse.BooleanOptionalAction,
        default=False,
        help="force-update generated CSV columns",
    )
    parser.add_argument(
        "-s",
        "--seed",
        type=str,
        help="specify random seed for alias generation (treat this like a password & change it for each round)",
    )
    return parser
@ -63,7 +142,7 @@ class KnownColumns(enum.StrEnum):
Timestamp = "Timestamp" Timestamp = "Timestamp"
EmailAddress = "Email Address" EmailAddress = "Email Address"
GeneratedAlias = "Generated Alias" GeneratedAlias = "Generated Alias"
IgnoreResubmittedFile = "Ignore Resubmitted File" IgnoreFile = "Ignore File"
# Not persisted: # Not persisted:
ExtractedTo = "Extracted To" ExtractedTo = "Extracted To"
@ -89,9 +168,34 @@ def canonical_simfile_filename(sm: Simfile) -> str:
################ ################
def _directory_contains_zip(path: str) -> bool:
    """Return True if any direct child of `path` has a name ending in ".zip"."""
    # `any` stops early, so close the scandir iterator explicitly via `with`
    with os.scandir(path) as entries:
        return any(entry.name.endswith(".zip") for entry in entries)


def process_args(args: AnonymizeEntriesRawArgs) -> AnonymizeEntriesArgs:
    """Resolve the CSV and uploads paths, discovering them in data_dir if unset.

    When --csv and/or --file-uploads is missing, the direct children of
    `args.data_dir` are examined: the first file whose name ends in ".csv"
    fills `args.csv`, and the first subdirectory that directly contains a
    ".zip" entry fills `args.file_uploads`. Explicitly provided values are
    never overwritten. `args` is mutated in place and returned.

    Raises AssertionError if data_dir is needed but missing, or if either
    path is still unresolved after the scan.
    """
    if not args.csv or not args.file_uploads:
        assert (
            args.data_dir
        ), "Positional data_dir argument must be provided if --csv and --file-uploads are not both set"

        # Sort by name so the pick is reproducible when several candidates
        # exist (raw scandir order is filesystem-dependent).
        with os.scandir(args.data_dir) as entries:
            dir_entries = sorted(entries, key=lambda entry: entry.name)

        for dir_entry in dir_entries:
            if args.csv and args.file_uploads:
                break  # both resolved; nothing left to discover
            if not args.csv and dir_entry.is_file() and dir_entry.name.endswith(".csv"):
                args.csv = dir_entry.path
            if (
                not args.file_uploads
                and dir_entry.is_dir()
                and _directory_contains_zip(dir_entry.path)
            ):
                args.file_uploads = dir_entry.path

    assert args.csv, "Unable to find a CSV file in the provided directory"
    assert (
        args.file_uploads
    ), "Unable to find a subdirectory containing ZIP files in the provided directory"

    # Both optional paths are known to be set now, so the narrower type applies
    return cast(AnonymizeEntriesArgs, args)
def assert_valid_file_paths(args: AnonymizeEntriesArgs):
    """Assert that the CSV path is a file and the uploads path is a directory.

    Raises AssertionError naming the offending path otherwise.
    """
    assert os.path.isfile(args.csv), f"{args.csv!r} is not a file"
    assert os.path.isdir(
        args.file_uploads
    ), f"{args.file_uploads!r} is not a directory"
def load_csv_contents(args: AnonymizeEntriesArgs): def load_csv_contents(args: AnonymizeEntriesArgs):
@ -140,7 +244,9 @@ def maybe_generate_aliases(
alias_parts: tuple[list[str], list[str]], alias_parts: tuple[list[str], list[str]],
csv_contents: CsvContents, csv_contents: CsvContents,
) -> ChangedCsvContents: ) -> ChangedCsvContents:
reuse_aliases = KnownColumns.GeneratedAlias in csv_contents[0] reuse_aliases = (
not args.regenerate and KnownColumns.GeneratedAlias in csv_contents[0]
)
if reuse_aliases: if reuse_aliases:
print("Reusing generated aliases") print("Reusing generated aliases")
@ -148,8 +254,10 @@ def maybe_generate_aliases(
alias_to_email_address = {} alias_to_email_address = {}
seed = args.seed or args.csv
for row in csv_contents: for row in csv_contents:
rnd = Random("; ".join([row[KnownColumns.EmailAddress], args.csv, args.files])) rnd = Random(",".join([row[KnownColumns.EmailAddress], seed]))
random_alias = f"{rnd.choice(alias_parts[0])} {rnd.choice(alias_parts[0])}" random_alias = f"{rnd.choice(alias_parts[0])} {rnd.choice(alias_parts[0])}"
while ( while (
random_alias in alias_to_email_address random_alias in alias_to_email_address
@ -165,8 +273,12 @@ def maybe_generate_aliases(
return True return True
def maybe_mark_resubmitted_entries(csv_contents: CsvContents) -> ChangedCsvContents: def maybe_mark_resubmitted_entries(
reuse_resubmitted = KnownColumns.IgnoreResubmittedFile in csv_contents[0] args: AnonymizeEntriesArgs, csv_contents: CsvContents
) -> ChangedCsvContents:
reuse_resubmitted = (
not args.regenerate and KnownColumns.IgnoreFile in csv_contents[0]
)
if reuse_resubmitted: if reuse_resubmitted:
print("Reusing resubmitted files column") print("Reusing resubmitted files column")
return False return False
@ -185,9 +297,7 @@ def maybe_mark_resubmitted_entries(csv_contents: CsvContents) -> ChangedCsvConte
most_recent_entry_per_user[user] = timestamp most_recent_entry_per_user[user] = timestamp
elif loop_pass == "mark": elif loop_pass == "mark":
resubmitted = timestamp < most_recent_entry_per_user[user] resubmitted = timestamp < most_recent_entry_per_user[user]
row[KnownColumns.IgnoreResubmittedFile] = ( row[KnownColumns.IgnoreFile] = "resubmitted" if resubmitted else ""
"true" if resubmitted else ""
)
if resubmitted: if resubmitted:
resubmitted_total += 1 resubmitted_total += 1
print(f"Marked {resubmitted_total} resubmitted files to be ignored") print(f"Marked {resubmitted_total} resubmitted files to be ignored")
@ -206,6 +316,32 @@ def maybe_save_generated_columns(args: AnonymizeEntriesArgs, csv_contents: CsvCo
print("Wrote generated columns back to CSV") print("Wrote generated columns back to CSV")
def maybe_mark_unspecified_emails(
    args: AnonymizeEntriesArgs, csv_contents: CsvContents
) -> None:
    """Mark rows from emails outside --emails as ignored ("unspecified").

    Does nothing when --emails was not given. Rows whose "Ignore File"
    column is already set are left untouched and not counted. The rows in
    `csv_contents` are modified in place.

    Raises AssertionError if no remaining row matches a specified email.
    """
    if not args.emails:
        return

    # Tolerate spaces around the commas ("a@x.org, b@y.org") and stray
    # separators (trailing comma) by stripping items and dropping empties.
    emails = {email.strip() for email in args.emails.split(",")} - {""}

    unspecified_total = 0
    specified_total = 0
    for row in csv_contents:
        if row[KnownColumns.IgnoreFile]:
            continue  # already ignored for another reason
        if row[KnownColumns.EmailAddress] in emails:
            specified_total += 1
        else:
            row[KnownColumns.IgnoreFile] = "unspecified"
            unspecified_total += 1

    assert specified_total > 0, "No responses were found from the specified emails"

    s = "s" if specified_total != 1 else ""
    print(
        f"Processing {specified_total} file{s} for specified emails & ignoring {unspecified_total} others"
    )
def extract_entries_to_temporary_folder( def extract_entries_to_temporary_folder(
args: AnonymizeEntriesArgs, args: AnonymizeEntriesArgs,
csv_contents: CsvContents, csv_contents: CsvContents,
@ -244,12 +380,14 @@ def extract_entries_to_temporary_folder(
copy_dir(zip_fs, zip_path, temp_fs, canonical_filename) copy_dir(zip_fs, zip_path, temp_fs, canonical_filename)
return canonical_filename return canonical_filename
temp_fs = TempFS(identifier="dimocracy-voucher_anonymized") temp_fs = TempFS(identifier="dimocracy-voucher")
for row in csv_contents: for row in csv_contents:
if row[KnownColumns.IgnoreResubmittedFile]: if row[KnownColumns.IgnoreFile]:
continue continue
zip_absolute_path = os.path.join(args.files, row[dynamic_columns.filename]) zip_absolute_path = os.path.join(
args.file_uploads, row[dynamic_columns.filename]
)
if os.path.isfile(zip_absolute_path): if os.path.isfile(zip_absolute_path):
with open(zip_absolute_path, "rb") as zip_file: with open(zip_absolute_path, "rb") as zip_file:
zip_fs = ZipFS(zip_file) zip_fs = ZipFS(zip_file)
@ -261,11 +399,15 @@ def extract_entries_to_temporary_folder(
return temp_fs return temp_fs
def anonymize_entries( def maybe_anonymize_entries(
args: AnonymizeEntriesArgs, args: AnonymizeEntriesArgs,
csv_contents: CsvContents, csv_contents: CsvContents,
temp_fs: TempFS, temp_fs: TempFS,
): ):
if args.deanonymized:
print("Deanonymized - skipping anonymization step")
return
def maybe_rename_file(absolute_path: str | None, canonical_filename: str): def maybe_rename_file(absolute_path: str | None, canonical_filename: str):
if absolute_path and os.path.basename(absolute_path) != canonical_filename: if absolute_path and os.path.basename(absolute_path) != canonical_filename:
absolute_canonical_path = os.path.join( absolute_canonical_path = os.path.join(
@ -282,7 +424,7 @@ def anonymize_entries(
print(f"Deleted {os.path.relpath(absolute_path, temp_fs.root_path)}") print(f"Deleted {os.path.relpath(absolute_path, temp_fs.root_path)}")
for row in csv_contents: for row in csv_contents:
if row[KnownColumns.IgnoreResubmittedFile]: if row[KnownColumns.IgnoreFile]:
continue continue
absolute_simfile_dir_path = os.path.join( absolute_simfile_dir_path = os.path.join(
@ -376,16 +518,17 @@ def anonymize_entries(
) )
def maybe_save_anonymized_files(
    args: AnonymizeEntriesArgs,
    csv_contents: CsvContents,
    temp_fs: TempFS,
):
    """Copy the temporary folder into a timestamped directory under --output.

    Skipped entirely on a dry run. The directory is named
    "anonymized-<timestamp>", or "deanonymized-<timestamp>" when
    --deanonymized is set.
    """
    if args.dry_run:
        print("Dry run - not saving files")
        return

    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    de = "de" if args.deanonymized else ""
    # os.path.join avoids a doubled separator when --output already ends
    # with one (the default is "output/").
    output_path = os.path.join(args.output, f"{de}anonymized-{timestamp}")
    shutil.copytree(temp_fs.root_path, output_path)
    print(f"Saved to {os.path.abspath(output_path)}")
@ -396,21 +539,26 @@ def save_anonymized_files(
def main(argv: list[str]):
    """Run the script end to end for the given argument vector.

    `argv[0]` (the program name) is skipped; the remaining items are parsed
    as command-line arguments.
    """
    raw_args = argparser().parse_args(argv[1:], namespace=AnonymizeEntriesRawArgs())
    args = process_args(raw_args)
    assert_valid_file_paths(args)
    alias_parts = load_alias_parts("aliasparts.csv")
    csv_contents = load_csv_contents(args)
    assert_known_google_forms_columns_present(csv_contents)
    dynamic_columns = detect_dynamic_columns(csv_contents)

    # Generate & save CSV columns
    csv_contents_changed = maybe_generate_aliases(args, alias_parts, csv_contents)
    csv_contents_changed |= maybe_mark_resubmitted_entries(args, csv_contents)
    if csv_contents_changed:
        maybe_save_generated_columns(args, csv_contents)

    # Generate temporary CSV columns (marked after the save step above, so
    # they are not written back by it)
    maybe_mark_unspecified_emails(args, csv_contents)

    temp_fs = extract_entries_to_temporary_folder(args, csv_contents, dynamic_columns)
    try:
        maybe_anonymize_entries(args, csv_contents, temp_fs)
        maybe_save_anonymized_files(args, csv_contents, temp_fs)
    finally:
        # Close the temporary filesystem even if a step above fails, instead
        # of leaving it open until interpreter exit.
        temp_fs.close()
if __name__ == "__main__": if __name__ == "__main__":