nicer cli args
This commit is contained in:
parent
2b1bf885a1
commit
d029642e08
1 changed file with 176 additions and 28 deletions
|
@ -8,6 +8,8 @@ import os
|
||||||
from random import Random
|
from random import Random
|
||||||
import shutil
|
import shutil
|
||||||
import sys
|
import sys
|
||||||
|
import textwrap
|
||||||
|
from typing import cast
|
||||||
from zipfile import ZipFile
|
from zipfile import ZipFile
|
||||||
|
|
||||||
import fs.path
|
import fs.path
|
||||||
|
@ -28,27 +30,104 @@ from simfile.types import Simfile
|
||||||
####################
|
####################
|
||||||
|
|
||||||
|
|
||||||
class AnonymizeEntriesArgs:
|
class AnonymizeEntriesRawArgs:
|
||||||
|
data_dir: str | None
|
||||||
|
csv: str | None
|
||||||
|
file_uploads: str | None
|
||||||
|
deanonymized: bool
|
||||||
|
dry_run: bool
|
||||||
|
emails: str
|
||||||
|
output: str
|
||||||
|
regenerate: bool
|
||||||
|
seed: str
|
||||||
|
|
||||||
|
|
||||||
|
class AnonymizeEntriesArgs(AnonymizeEntriesRawArgs):
|
||||||
"""Stores the command-line arguments for this script."""
|
"""Stores the command-line arguments for this script."""
|
||||||
|
|
||||||
csv: str
|
csv: str
|
||||||
files: str
|
file_uploads: str
|
||||||
dry_run: bool
|
|
||||||
|
|
||||||
|
|
||||||
def argparser() -> argparse.ArgumentParser:
    """Get an ArgumentParser instance for this command-line script.

    The parser accepts one optional positional ``data_dir`` plus options
    that override the CSV / uploads locations and control dry-run,
    deanonymized packaging, email filtering, output directory, column
    regeneration and the alias seed.
    """
    # NOTE(review): the relative indentation inside this example text could
    # not be recovered from the diff view - confirm against the repository.
    usage_example = textwrap.dedent(
        """\
        example:

          path/to/folder:
          ├ form_responses.csv
          └ file_responses/
            ├ Upload A - User 1.zip
            ├ Upload B - User 2.zip
            └ etc.

          python ./anonymize_entries.py path/to/folder

            OR

          python ./anonymize_entries.py -c path/to/folder/form_responses.csv -f path/to/folder/file_responses
        """
    )

    # RawDescriptionHelpFormatter keeps the epilog's line breaks intact.
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_example,
    )

    # Input locations: one directory, or explicit per-path overrides.
    parser.add_argument(
        "data_dir",
        nargs="?",
        type=str,
        help="directory containing both the CSV form data and the file responses (uploads)",
    )
    parser.add_argument(
        "-c",
        "--csv",
        type=str,
        help="override path to the CSV file of form responses",
    )
    parser.add_argument(
        "-f",
        "--file-uploads",
        type=str,
        help="override path to the directory of file responses (uploads)",
    )

    # Behaviour switches and output options.
    parser.add_argument(
        "-d",
        "--dry-run",
        action=argparse.BooleanOptionalAction,
        help="do not create or modify any files",
    )
    parser.add_argument(
        "-D",
        "--deanonymized",
        action=argparse.BooleanOptionalAction,
        help="skip anonymization of files, simply package them as-is",
    )
    parser.add_argument(
        "-e",
        "--emails",
        type=str,
        help="limit output to files from the specified emails (comma-separated)",
    )
    parser.add_argument(
        "-o",
        "--output",
        type=str,
        default="output/",
        help="output directory",
    )
    parser.add_argument(
        "-r",
        "--regenerate",
        action=argparse.BooleanOptionalAction,
        help="force-update generated CSV columns",
    )
    parser.add_argument(
        "-s",
        "--seed",
        type=str,
        help="specify random seed for alias generation (treat this like a password & change it for each round)",
    )

    return parser
|
||||||
|
|
||||||
|
|
||||||
|
@ -63,7 +142,7 @@ class KnownColumns(enum.StrEnum):
|
||||||
Timestamp = "Timestamp"
|
Timestamp = "Timestamp"
|
||||||
EmailAddress = "Email Address"
|
EmailAddress = "Email Address"
|
||||||
GeneratedAlias = "Generated Alias"
|
GeneratedAlias = "Generated Alias"
|
||||||
IgnoreResubmittedFile = "Ignore Resubmitted File"
|
IgnoreFile = "Ignore File"
|
||||||
# Not persisted:
|
# Not persisted:
|
||||||
ExtractedTo = "Extracted To"
|
ExtractedTo = "Extracted To"
|
||||||
|
|
||||||
|
@ -89,9 +168,34 @@ def canonical_simfile_filename(sm: Simfile) -> str:
|
||||||
################
|
################
|
||||||
|
|
||||||
|
|
||||||
|
def process_args(args: AnonymizeEntriesRawArgs) -> AnonymizeEntriesArgs:
    """Resolve the CSV and uploads paths, auto-detecting them from ``data_dir``.

    When ``--csv`` and ``--file-uploads`` are not both set, ``args.data_dir``
    is scanned: the first ``*.csv`` file (by name) becomes ``args.csv`` and
    the first subdirectory (by name) that contains a ``*.zip`` entry becomes
    ``args.file_uploads``. Explicitly provided values are never overwritten.

    The namespace is updated in place and returned, re-typed as
    ``AnonymizeEntriesArgs``.

    Raises:
        AssertionError: if detection is needed but ``data_dir`` is missing,
            or no CSV file / ZIP-containing subdirectory can be found.
    """
    if not args.csv or not args.file_uploads:
        assert (
            args.data_dir
        ), "Positional data_dir argument must be provided if --csv and --file-uploads are not both set"

        # scandir yields entries in arbitrary order; sorting by name makes the
        # choice reproducible when several candidates exist. The `with` block
        # releases the directory handle as soon as the listing is read.
        with os.scandir(args.data_dir) as scan:
            dir_entries = sorted(scan, key=lambda entry: entry.name)

        for dir_entry in dir_entries:
            if not args.csv and dir_entry.is_file() and dir_entry.name.endswith(".csv"):
                args.csv = dir_entry.path
            if not args.file_uploads and dir_entry.is_dir():
                # any() stops at the first match, so the iterator is usually
                # not exhausted; close it explicitly via the context manager.
                with os.scandir(dir_entry.path) as subdir_scan:
                    contains_zip = any(
                        subdir_entry.name.endswith(".zip")
                        for subdir_entry in subdir_scan
                    )
                if contains_zip:
                    args.file_uploads = dir_entry.path

        assert args.csv, "Unable to find a CSV file in the provided directory"
        assert (
            args.file_uploads
        ), "Unable to find a subdirectory containing ZIP files in the provided directory"

    # String form keeps the cast free of a runtime name lookup.
    return cast("AnonymizeEntriesArgs", args)
|
||||||
|
|
||||||
|
|
||||||
def assert_valid_file_paths(args: AnonymizeEntriesArgs):
    """Assert that the resolved input paths exist with the expected kinds.

    ``args.csv`` must be an existing file and ``args.file_uploads`` an
    existing directory; otherwise an AssertionError naming the offending
    path is raised.
    """
    csv_path = args.csv
    assert os.path.isfile(csv_path), f"{csv_path!r} is not a file"

    uploads_path = args.file_uploads
    assert os.path.isdir(uploads_path), f"{uploads_path!r} is not a directory"
|
||||||
|
|
||||||
|
|
||||||
def load_csv_contents(args: AnonymizeEntriesArgs):
|
def load_csv_contents(args: AnonymizeEntriesArgs):
|
||||||
|
@ -140,7 +244,9 @@ def maybe_generate_aliases(
|
||||||
alias_parts: tuple[list[str], list[str]],
|
alias_parts: tuple[list[str], list[str]],
|
||||||
csv_contents: CsvContents,
|
csv_contents: CsvContents,
|
||||||
) -> ChangedCsvContents:
|
) -> ChangedCsvContents:
|
||||||
reuse_aliases = KnownColumns.GeneratedAlias in csv_contents[0]
|
reuse_aliases = (
|
||||||
|
not args.regenerate and KnownColumns.GeneratedAlias in csv_contents[0]
|
||||||
|
)
|
||||||
|
|
||||||
if reuse_aliases:
|
if reuse_aliases:
|
||||||
print("Reusing generated aliases")
|
print("Reusing generated aliases")
|
||||||
|
@ -148,8 +254,10 @@ def maybe_generate_aliases(
|
||||||
|
|
||||||
alias_to_email_address = {}
|
alias_to_email_address = {}
|
||||||
|
|
||||||
|
seed = args.seed or args.csv
|
||||||
|
|
||||||
for row in csv_contents:
|
for row in csv_contents:
|
||||||
rnd = Random("; ".join([row[KnownColumns.EmailAddress], args.csv, args.files]))
|
rnd = Random(",".join([row[KnownColumns.EmailAddress], seed]))
|
||||||
random_alias = f"{rnd.choice(alias_parts[0])} {rnd.choice(alias_parts[0])}"
|
random_alias = f"{rnd.choice(alias_parts[0])} {rnd.choice(alias_parts[0])}"
|
||||||
while (
|
while (
|
||||||
random_alias in alias_to_email_address
|
random_alias in alias_to_email_address
|
||||||
|
@ -165,8 +273,12 @@ def maybe_generate_aliases(
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
||||||
def maybe_mark_resubmitted_entries(csv_contents: CsvContents) -> ChangedCsvContents:
|
def maybe_mark_resubmitted_entries(
|
||||||
reuse_resubmitted = KnownColumns.IgnoreResubmittedFile in csv_contents[0]
|
args: AnonymizeEntriesArgs, csv_contents: CsvContents
|
||||||
|
) -> ChangedCsvContents:
|
||||||
|
reuse_resubmitted = (
|
||||||
|
not args.regenerate and KnownColumns.IgnoreFile in csv_contents[0]
|
||||||
|
)
|
||||||
if reuse_resubmitted:
|
if reuse_resubmitted:
|
||||||
print("Reusing resubmitted files column")
|
print("Reusing resubmitted files column")
|
||||||
return False
|
return False
|
||||||
|
@ -185,9 +297,7 @@ def maybe_mark_resubmitted_entries(csv_contents: CsvContents) -> ChangedCsvConte
|
||||||
most_recent_entry_per_user[user] = timestamp
|
most_recent_entry_per_user[user] = timestamp
|
||||||
elif loop_pass == "mark":
|
elif loop_pass == "mark":
|
||||||
resubmitted = timestamp < most_recent_entry_per_user[user]
|
resubmitted = timestamp < most_recent_entry_per_user[user]
|
||||||
row[KnownColumns.IgnoreResubmittedFile] = (
|
row[KnownColumns.IgnoreFile] = "resubmitted" if resubmitted else ""
|
||||||
"true" if resubmitted else ""
|
|
||||||
)
|
|
||||||
if resubmitted:
|
if resubmitted:
|
||||||
resubmitted_total += 1
|
resubmitted_total += 1
|
||||||
print(f"Marked {resubmitted_total} resubmitted files to be ignored")
|
print(f"Marked {resubmitted_total} resubmitted files to be ignored")
|
||||||
|
@ -206,6 +316,32 @@ def maybe_save_generated_columns(args: AnonymizeEntriesArgs, csv_contents: CsvCo
|
||||||
print("Wrote generated columns back to CSV")
|
print("Wrote generated columns back to CSV")
|
||||||
|
|
||||||
|
|
||||||
|
def maybe_mark_unspecified_emails(
    args: AnonymizeEntriesArgs, csv_contents: CsvContents
) -> None:
    """Ignore every response whose email was not listed via ``--emails``.

    Does nothing when ``args.emails`` is empty. Otherwise each row that is
    not already ignored gets its ``IgnoreFile`` column set to
    ``"unspecified"`` unless its email address is in the requested set.
    Rows are modified in place and a summary line is printed.

    Raises:
        AssertionError: if no non-ignored row matches any requested email.
    """
    if not args.emails:
        return

    # Accept "a@x.com, b@x.com": strip whitespace around each address and
    # drop empty items left by stray commas.
    emails = {email.strip() for email in args.emails.split(",")} - {""}

    unspecified_total = 0
    specified_total = 0

    for row in csv_contents:
        if row[KnownColumns.IgnoreFile]:
            # Already ignored for another reason; leave that reason in place.
            continue
        if row[KnownColumns.EmailAddress] not in emails:
            row[KnownColumns.IgnoreFile] = "unspecified"
            unspecified_total += 1
        else:
            specified_total += 1

    assert specified_total > 0, "No responses were found from the specified emails"

    s = "s" if specified_total != 1 else ""
    print(
        f"Processing {specified_total} file{s} for specified emails & ignoring {unspecified_total} others"
    )
|
||||||
|
|
||||||
|
|
||||||
def extract_entries_to_temporary_folder(
|
def extract_entries_to_temporary_folder(
|
||||||
args: AnonymizeEntriesArgs,
|
args: AnonymizeEntriesArgs,
|
||||||
csv_contents: CsvContents,
|
csv_contents: CsvContents,
|
||||||
|
@ -244,12 +380,14 @@ def extract_entries_to_temporary_folder(
|
||||||
copy_dir(zip_fs, zip_path, temp_fs, canonical_filename)
|
copy_dir(zip_fs, zip_path, temp_fs, canonical_filename)
|
||||||
return canonical_filename
|
return canonical_filename
|
||||||
|
|
||||||
temp_fs = TempFS(identifier="dimocracy-voucher_anonymized")
|
temp_fs = TempFS(identifier="dimocracy-voucher")
|
||||||
|
|
||||||
for row in csv_contents:
|
for row in csv_contents:
|
||||||
if row[KnownColumns.IgnoreResubmittedFile]:
|
if row[KnownColumns.IgnoreFile]:
|
||||||
continue
|
continue
|
||||||
zip_absolute_path = os.path.join(args.files, row[dynamic_columns.filename])
|
zip_absolute_path = os.path.join(
|
||||||
|
args.file_uploads, row[dynamic_columns.filename]
|
||||||
|
)
|
||||||
if os.path.isfile(zip_absolute_path):
|
if os.path.isfile(zip_absolute_path):
|
||||||
with open(zip_absolute_path, "rb") as zip_file:
|
with open(zip_absolute_path, "rb") as zip_file:
|
||||||
zip_fs = ZipFS(zip_file)
|
zip_fs = ZipFS(zip_file)
|
||||||
|
@ -261,11 +399,15 @@ def extract_entries_to_temporary_folder(
|
||||||
return temp_fs
|
return temp_fs
|
||||||
|
|
||||||
|
|
||||||
def anonymize_entries(
|
def maybe_anonymize_entries(
|
||||||
args: AnonymizeEntriesArgs,
|
args: AnonymizeEntriesArgs,
|
||||||
csv_contents: CsvContents,
|
csv_contents: CsvContents,
|
||||||
temp_fs: TempFS,
|
temp_fs: TempFS,
|
||||||
):
|
):
|
||||||
|
if args.deanonymized:
|
||||||
|
print("Deanonymized - skipping anonymization step")
|
||||||
|
return
|
||||||
|
|
||||||
def maybe_rename_file(absolute_path: str | None, canonical_filename: str):
|
def maybe_rename_file(absolute_path: str | None, canonical_filename: str):
|
||||||
if absolute_path and os.path.basename(absolute_path) != canonical_filename:
|
if absolute_path and os.path.basename(absolute_path) != canonical_filename:
|
||||||
absolute_canonical_path = os.path.join(
|
absolute_canonical_path = os.path.join(
|
||||||
|
@ -282,7 +424,7 @@ def anonymize_entries(
|
||||||
print(f"Deleted {os.path.relpath(absolute_path, temp_fs.root_path)}")
|
print(f"Deleted {os.path.relpath(absolute_path, temp_fs.root_path)}")
|
||||||
|
|
||||||
for row in csv_contents:
|
for row in csv_contents:
|
||||||
if row[KnownColumns.IgnoreResubmittedFile]:
|
if row[KnownColumns.IgnoreFile]:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
absolute_simfile_dir_path = os.path.join(
|
absolute_simfile_dir_path = os.path.join(
|
||||||
|
@ -376,16 +518,17 @@ def anonymize_entries(
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def maybe_save_anonymized_files(
    args: AnonymizeEntriesArgs,
    csv_contents: CsvContents,
    temp_fs: TempFS,
) -> None:
    """Copy the temporary folder into a timestamped directory under ``args.output``.

    On a dry run nothing is written and a notice is printed instead. The
    destination is named ``anonymized-<timestamp>``, or
    ``deanonymized-<timestamp>`` when ``args.deanonymized`` is set, and its
    absolute path is printed after the copy. ``csv_contents`` is not used in
    the part of this function visible in this view.
    """
    if args.dry_run:
        print("Dry run - not saving files")
        return

    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    de = "de" if args.deanonymized else ""
    # os.path.join copes with a trailing separator (the default is "output/")
    # and keeps an empty --output relative to the current directory instead
    # of resolving to the filesystem root.
    output_path = os.path.join(args.output, f"{de}anonymized-{timestamp}")
    shutil.copytree(temp_fs.root_path, output_path)
    print(f"Saved to {os.path.abspath(output_path)}")
|
||||||
|
|
||||||
|
@ -396,21 +539,26 @@ def save_anonymized_files(
|
||||||
|
|
||||||
|
|
||||||
def main(argv: list[str]):
    """Run the whole pipeline for the given command line.

    ``argv[0]`` (the program name) is skipped. Arguments are parsed and
    resolved, the CSV is loaded and checked, generated columns are computed
    and written back if they changed, and finally the entries are extracted,
    optionally anonymized, and optionally saved.
    """
    parser = argparser()
    raw_args = parser.parse_args(argv[1:], namespace=AnonymizeEntriesRawArgs())
    args = process_args(raw_args)
    assert_valid_file_paths(args)

    alias_parts = load_alias_parts("aliasparts.csv")
    csv_contents = load_csv_contents(args)
    assert_known_google_forms_columns_present(csv_contents)
    dynamic_columns = detect_dynamic_columns(csv_contents)

    # Generate & save CSV columns. Both steps always run; the CSV is only
    # rewritten when at least one of them reports a change.
    aliases_changed = maybe_generate_aliases(args, alias_parts, csv_contents)
    resubmitted_changed = maybe_mark_resubmitted_entries(args, csv_contents)
    if aliases_changed or resubmitted_changed:
        maybe_save_generated_columns(args, csv_contents)

    # Generate temporary CSV columns (applied after the save step above)
    maybe_mark_unspecified_emails(args, csv_contents)

    temp_fs = extract_entries_to_temporary_folder(args, csv_contents, dynamic_columns)
    maybe_anonymize_entries(args, csv_contents, temp_fs)
    maybe_save_anonymized_files(args, csv_contents, temp_fs)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
Loading…
Reference in a new issue