nicer cli args
This commit is contained in:
		
							parent
							
								
									2b1bf885a1
								
							
						
					
					
						commit
						d029642e08
					
				
					 1 changed file with 176 additions and 28 deletions
				
			
		|  | @ -8,6 +8,8 @@ import os | |||
| from random import Random | ||||
| import shutil | ||||
| import sys | ||||
| import textwrap | ||||
| from typing import cast | ||||
| from zipfile import ZipFile | ||||
| 
 | ||||
| import fs.path | ||||
|  | @ -28,27 +30,104 @@ from simfile.types import Simfile | |||
| #################### | ||||
| 
 | ||||
| 
 | ||||
| class AnonymizeEntriesArgs: | ||||
| class AnonymizeEntriesRawArgs: | ||||
|     data_dir: str | None | ||||
|     csv: str | None | ||||
|     file_uploads: str | None | ||||
|     deanonymized: bool | ||||
|     dry_run: bool | ||||
|     emails: str | ||||
|     output: str | ||||
|     regenerate: bool | ||||
|     seed: str | ||||
| 
 | ||||
| 
 | ||||
| class AnonymizeEntriesArgs(AnonymizeEntriesRawArgs): | ||||
|     """Stores the command-line arguments for this script.""" | ||||
| 
 | ||||
|     csv: str | ||||
|     files: str | ||||
|     dry_run: bool | ||||
|     file_uploads: str | ||||
| 
 | ||||
| 
 | ||||
| def argparser(): | ||||
|     """Get an ArgumentParser instance for this command-line script.""" | ||||
|     parser = argparse.ArgumentParser() | ||||
|     parser.add_argument("csv", type=str, help="path to the CSV file of form responses") | ||||
|     parser = argparse.ArgumentParser( | ||||
|         formatter_class=argparse.RawDescriptionHelpFormatter, | ||||
|         epilog=textwrap.dedent( | ||||
|             """\ | ||||
|             example: | ||||
|              | ||||
|                 path/to/folder: | ||||
|                 ├ form_responses.csv | ||||
|                 └ file_responses/ | ||||
|                     ├ Upload A - User 1.zip | ||||
|                     ├ Upload B - User 2.zip | ||||
|                     └ etc. | ||||
|          | ||||
|                 python ./anonymize_entries.py path/to/folder | ||||
|              | ||||
|                     OR | ||||
| 
 | ||||
|                 python ./anonymize_entries.py -c path/to/folder/form_responses.csv -f path/to/folder/file_responses | ||||
|         """ | ||||
|         ), | ||||
|     ) | ||||
|     parser.add_argument( | ||||
|         "files", type=str, help="path to the directory of file responses" | ||||
|         "data_dir", | ||||
|         nargs="?", | ||||
|         type=str, | ||||
|         help="directory containing both the CSV form data and the file responses (uploads)", | ||||
|     ) | ||||
|     parser.add_argument( | ||||
|         "-c", | ||||
|         "--csv", | ||||
|         type=str, | ||||
|         help="override path to the CSV file of form responses", | ||||
|     ) | ||||
|     parser.add_argument( | ||||
|         "-f", | ||||
|         "--file-uploads", | ||||
|         type=str, | ||||
|         help="override path to the directory of file responses (uploads)", | ||||
|     ) | ||||
|     parser.add_argument( | ||||
|         "-d", | ||||
|         "--dry-run", | ||||
|         action=argparse.BooleanOptionalAction, | ||||
|         help="preview changes without writing the file", | ||||
|         help="do not create or modify any files", | ||||
|     ) | ||||
|     parser.add_argument( | ||||
|         "-D", | ||||
|         "--deanonymized", | ||||
|         action=argparse.BooleanOptionalAction, | ||||
|         help="skip anonymization of files, simply package them as-is", | ||||
|     ) | ||||
|     parser.add_argument( | ||||
|         "-e", | ||||
|         "--emails", | ||||
|         type=str, | ||||
|         help="limit output to files from the specified emails (comma-separated)", | ||||
|     ) | ||||
|     parser.add_argument( | ||||
|         "-o", | ||||
|         "--output", | ||||
|         type=str, | ||||
|         default="output/", | ||||
|         help="output directory", | ||||
|     ) | ||||
|     parser.add_argument( | ||||
|         "-r", | ||||
|         "--regenerate", | ||||
|         action=argparse.BooleanOptionalAction, | ||||
|         help="force-update generated CSV columns", | ||||
|     ) | ||||
|     parser.add_argument( | ||||
|         "-s", | ||||
|         "--seed", | ||||
|         type=str, | ||||
|         help="specify random seed for alias generation (treat this like a password & change it for each round)", | ||||
|     ) | ||||
| 
 | ||||
|     return parser | ||||
| 
 | ||||
| 
 | ||||
|  | @ -63,7 +142,7 @@ class KnownColumns(enum.StrEnum): | |||
|     Timestamp = "Timestamp" | ||||
|     EmailAddress = "Email Address" | ||||
|     GeneratedAlias = "Generated Alias" | ||||
|     IgnoreResubmittedFile = "Ignore Resubmitted File" | ||||
|     IgnoreFile = "Ignore File" | ||||
|     # Not persisted: | ||||
|     ExtractedTo = "Extracted To" | ||||
| 
 | ||||
|  | @ -89,9 +168,34 @@ def canonical_simfile_filename(sm: Simfile) -> str: | |||
| ################ | ||||
| 
 | ||||
| 
 | ||||
| def process_args(args: AnonymizeEntriesRawArgs) -> AnonymizeEntriesArgs: | ||||
|     if not args.csv or not args.file_uploads: | ||||
|         assert ( | ||||
|             args.data_dir | ||||
|         ), "Positional data_dir argument must be provided if --csv and --file-uploads are not both set" | ||||
|         for dir_entry in os.scandir(args.data_dir): | ||||
|             if not args.csv and dir_entry.is_file() and dir_entry.name.endswith(".csv"): | ||||
|                 args.csv = dir_entry.path | ||||
|             if not args.file_uploads and dir_entry.is_dir(): | ||||
|                 if any( | ||||
|                     subdir_entry.name.endswith(".zip") | ||||
|                     for subdir_entry in os.scandir(dir_entry.path) | ||||
|                 ): | ||||
|                     args.file_uploads = dir_entry.path | ||||
| 
 | ||||
|     assert args.csv, "Unable to find a CSV file in the provided directory" | ||||
|     assert ( | ||||
|         args.file_uploads | ||||
|     ), "Unable to find a subdirectory containing ZIP files in the provided directory" | ||||
| 
 | ||||
|     return cast(AnonymizeEntriesArgs, args) | ||||
| 
 | ||||
| 
 | ||||
| def assert_valid_file_paths(args: AnonymizeEntriesArgs): | ||||
|     assert os.path.isfile(args.csv), f"{repr(args.csv)} is not a file" | ||||
|     assert os.path.isdir(args.files), f"{repr(args.files)} is not a directory" | ||||
|     assert os.path.isdir( | ||||
|         args.file_uploads | ||||
|     ), f"{repr(args.file_uploads)} is not a directory" | ||||
| 
 | ||||
| 
 | ||||
| def load_csv_contents(args: AnonymizeEntriesArgs): | ||||
|  | @ -140,7 +244,9 @@ def maybe_generate_aliases( | |||
|     alias_parts: tuple[list[str], list[str]], | ||||
|     csv_contents: CsvContents, | ||||
| ) -> ChangedCsvContents: | ||||
|     reuse_aliases = KnownColumns.GeneratedAlias in csv_contents[0] | ||||
|     reuse_aliases = ( | ||||
|         not args.regenerate and KnownColumns.GeneratedAlias in csv_contents[0] | ||||
|     ) | ||||
| 
 | ||||
|     if reuse_aliases: | ||||
|         print("Reusing generated aliases") | ||||
|  | @ -148,8 +254,10 @@ def maybe_generate_aliases( | |||
| 
 | ||||
|     alias_to_email_address = {} | ||||
| 
 | ||||
|     seed = args.seed or args.csv | ||||
| 
 | ||||
|     for row in csv_contents: | ||||
|         rnd = Random("; ".join([row[KnownColumns.EmailAddress], args.csv, args.files])) | ||||
|         rnd = Random(",".join([row[KnownColumns.EmailAddress], seed])) | ||||
|         random_alias = f"{rnd.choice(alias_parts[0])} {rnd.choice(alias_parts[0])}" | ||||
|         while ( | ||||
|             random_alias in alias_to_email_address | ||||
|  | @ -165,8 +273,12 @@ def maybe_generate_aliases( | |||
|     return True | ||||
| 
 | ||||
| 
 | ||||
| def maybe_mark_resubmitted_entries(csv_contents: CsvContents) -> ChangedCsvContents: | ||||
|     reuse_resubmitted = KnownColumns.IgnoreResubmittedFile in csv_contents[0] | ||||
| def maybe_mark_resubmitted_entries( | ||||
|     args: AnonymizeEntriesArgs, csv_contents: CsvContents | ||||
| ) -> ChangedCsvContents: | ||||
|     reuse_resubmitted = ( | ||||
|         not args.regenerate and KnownColumns.IgnoreFile in csv_contents[0] | ||||
|     ) | ||||
|     if reuse_resubmitted: | ||||
|         print("Reusing resubmitted files column") | ||||
|         return False | ||||
|  | @ -185,9 +297,7 @@ def maybe_mark_resubmitted_entries(csv_contents: CsvContents) -> ChangedCsvConte | |||
|                         most_recent_entry_per_user[user] = timestamp | ||||
|                 elif loop_pass == "mark": | ||||
|                     resubmitted = timestamp < most_recent_entry_per_user[user] | ||||
|                     row[KnownColumns.IgnoreResubmittedFile] = ( | ||||
|                         "true" if resubmitted else "" | ||||
|                     ) | ||||
|                     row[KnownColumns.IgnoreFile] = "resubmitted" if resubmitted else "" | ||||
|                     if resubmitted: | ||||
|                         resubmitted_total += 1 | ||||
|         print(f"Marked {resubmitted_total} resubmitted files to be ignored") | ||||
|  | @ -206,6 +316,32 @@ def maybe_save_generated_columns(args: AnonymizeEntriesArgs, csv_contents: CsvCo | |||
|         print("Wrote generated columns back to CSV") | ||||
| 
 | ||||
| 
 | ||||
| def maybe_mark_unspecified_emails( | ||||
|     args: AnonymizeEntriesArgs, csv_contents: CsvContents | ||||
| ): | ||||
|     if not args.emails: | ||||
|         return | ||||
| 
 | ||||
|     unspecified_total = 0 | ||||
|     specified_total = 0 | ||||
|     emails = set(args.emails.split(",")) | ||||
| 
 | ||||
|     for row in csv_contents: | ||||
|         if not row[KnownColumns.IgnoreFile]: | ||||
|             if row[KnownColumns.EmailAddress] not in emails: | ||||
|                 row[KnownColumns.IgnoreFile] = "unspecified" | ||||
|                 unspecified_total += 1 | ||||
|             else: | ||||
|                 specified_total += 1 | ||||
| 
 | ||||
|     assert specified_total > 0, "No responses were found from the specified emails" | ||||
| 
 | ||||
|     s = "s" if specified_total != 1 else "" | ||||
|     print( | ||||
|         f"Processing {specified_total} file{s} for specified emails & ignoring {unspecified_total} others" | ||||
|     ) | ||||
| 
 | ||||
| 
 | ||||
| def extract_entries_to_temporary_folder( | ||||
|     args: AnonymizeEntriesArgs, | ||||
|     csv_contents: CsvContents, | ||||
|  | @ -244,12 +380,14 @@ def extract_entries_to_temporary_folder( | |||
|         copy_dir(zip_fs, zip_path, temp_fs, canonical_filename) | ||||
|         return canonical_filename | ||||
| 
 | ||||
|     temp_fs = TempFS(identifier="dimocracy-voucher_anonymized") | ||||
|     temp_fs = TempFS(identifier="dimocracy-voucher") | ||||
| 
 | ||||
|     for row in csv_contents: | ||||
|         if row[KnownColumns.IgnoreResubmittedFile]: | ||||
|         if row[KnownColumns.IgnoreFile]: | ||||
|             continue | ||||
|         zip_absolute_path = os.path.join(args.files, row[dynamic_columns.filename]) | ||||
|         zip_absolute_path = os.path.join( | ||||
|             args.file_uploads, row[dynamic_columns.filename] | ||||
|         ) | ||||
|         if os.path.isfile(zip_absolute_path): | ||||
|             with open(zip_absolute_path, "rb") as zip_file: | ||||
|                 zip_fs = ZipFS(zip_file) | ||||
|  | @ -261,11 +399,15 @@ def extract_entries_to_temporary_folder( | |||
|     return temp_fs | ||||
| 
 | ||||
| 
 | ||||
| def anonymize_entries( | ||||
| def maybe_anonymize_entries( | ||||
|     args: AnonymizeEntriesArgs, | ||||
|     csv_contents: CsvContents, | ||||
|     temp_fs: TempFS, | ||||
| ): | ||||
|     if args.deanonymized: | ||||
|         print("Deanonymized - skipping anonymization step") | ||||
|         return | ||||
| 
 | ||||
|     def maybe_rename_file(absolute_path: str | None, canonical_filename: str): | ||||
|         if absolute_path and os.path.basename(absolute_path) != canonical_filename: | ||||
|             absolute_canonical_path = os.path.join( | ||||
|  | @ -282,7 +424,7 @@ def anonymize_entries( | |||
|             print(f"Deleted {os.path.relpath(absolute_path, temp_fs.root_path)}") | ||||
| 
 | ||||
|     for row in csv_contents: | ||||
|         if row[KnownColumns.IgnoreResubmittedFile]: | ||||
|         if row[KnownColumns.IgnoreFile]: | ||||
|             continue | ||||
| 
 | ||||
|         absolute_simfile_dir_path = os.path.join( | ||||
|  | @ -376,16 +518,17 @@ def anonymize_entries( | |||
|                     ) | ||||
| 
 | ||||
| 
 | ||||
| def save_anonymized_files( | ||||
| def maybe_save_anonymized_files( | ||||
|     args: AnonymizeEntriesArgs, | ||||
|     csv_contents: CsvContents, | ||||
|     temp_fs: TempFS, | ||||
| ): | ||||
|     if args.dry_run: | ||||
|         print("Dry run - not saving anonymized files") | ||||
|         print("Dry run - not saving files") | ||||
|         return | ||||
|     timestamp = datetime.now().strftime("%Y%m%d-%H%M%S") | ||||
|     output_path = f"./output/anonymized-{timestamp}" | ||||
|     de = "de" if args.deanonymized else "" | ||||
|     output_path = f"{args.output}/{de}anonymized-{timestamp}" | ||||
|     shutil.copytree(temp_fs.root_path, output_path) | ||||
|     print(f"Saved to {os.path.abspath(output_path)}") | ||||
| 
 | ||||
|  | @ -396,21 +539,26 @@ def save_anonymized_files( | |||
| 
 | ||||
| 
 | ||||
| def main(argv: list[str]): | ||||
|     args = argparser().parse_args(argv[1:], namespace=AnonymizeEntriesArgs()) | ||||
|     raw_args = argparser().parse_args(argv[1:], namespace=AnonymizeEntriesRawArgs()) | ||||
|     args = process_args(raw_args) | ||||
|     assert_valid_file_paths(args) | ||||
|     alias_parts = load_alias_parts("aliasparts.csv") | ||||
|     csv_contents = load_csv_contents(args) | ||||
|     assert_known_google_forms_columns_present(csv_contents) | ||||
|     dynamic_columns = detect_dynamic_columns(csv_contents) | ||||
| 
 | ||||
|     # Generate & save CSV columns | ||||
|     csv_contents_changed = maybe_generate_aliases(args, alias_parts, csv_contents) | ||||
|     csv_contents_changed |= maybe_mark_resubmitted_entries(csv_contents) | ||||
|     csv_contents_changed |= maybe_mark_resubmitted_entries(args, csv_contents) | ||||
|     if csv_contents_changed: | ||||
|         maybe_save_generated_columns(args, csv_contents) | ||||
| 
 | ||||
|     # Generate temporary CSV columns | ||||
|     maybe_mark_unspecified_emails(args, csv_contents) | ||||
| 
 | ||||
|     temp_fs = extract_entries_to_temporary_folder(args, csv_contents, dynamic_columns) | ||||
|     anonymize_entries(args, csv_contents, temp_fs) | ||||
|     save_anonymized_files(args, csv_contents, temp_fs) | ||||
|     maybe_anonymize_entries(args, csv_contents, temp_fs) | ||||
|     maybe_save_anonymized_files(args, csv_contents, temp_fs) | ||||
| 
 | ||||
| 
 | ||||
| if __name__ == "__main__": | ||||
|  |  | |||
		Loading…
	
		Reference in a new issue