d2024r1 changes
This commit is contained in:
parent
275f27155d
commit
38a486131d
1 changed files with 85 additions and 41 deletions
|
@ -21,10 +21,11 @@ from fs.zipfs import ZipFS
|
|||
from pathvalidate import sanitize_filename
|
||||
import simfile
|
||||
from simfile.dir import SimfilePack, SimfileDirectory
|
||||
from simfile.notes import NoteData
|
||||
from simfile.sm import SMChart, SMSimfile
|
||||
from simfile.ssc import SSCChart, SSCSimfile
|
||||
from simfile.timing import BeatValues, BeatValue
|
||||
from simfile.types import Simfile
|
||||
from simfile.types import Simfile, Chart
|
||||
|
||||
|
||||
####################
|
||||
|
@ -38,7 +39,7 @@ class AnonymizeEntriesRawArgs:
|
|||
file_uploads: str | None
|
||||
deanonymized: bool
|
||||
dry_run: bool
|
||||
emails: str
|
||||
users: str
|
||||
output: str | None
|
||||
regenerate: bool
|
||||
seed: str
|
||||
|
@ -112,10 +113,10 @@ def argparser():
|
|||
help="skip anonymization of files, simply package them as-is",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-e",
|
||||
"--emails",
|
||||
"-u",
|
||||
"--users",
|
||||
type=str,
|
||||
help="limit output to files from the specified emails (comma-separated)",
|
||||
help="limit output to files from the specified users (comma-separated)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-r",
|
||||
|
@ -142,7 +143,7 @@ CsvContents = list[dict[str, str]]
|
|||
|
||||
class KnownColumns(enum.StrEnum):
|
||||
Timestamp = "Timestamp"
|
||||
EmailAddress = "Email Address"
|
||||
UserId = "Your gamer tag/alias: (e.g. dimo)"
|
||||
GeneratedAlias = "Generated Alias"
|
||||
IgnoreFile = "Ignore File"
|
||||
# Not persisted:
|
||||
|
@ -211,7 +212,7 @@ def assert_valid_file_paths(args: AnonymizeEntriesArgs):
|
|||
|
||||
|
||||
def load_csv_contents(args: AnonymizeEntriesArgs) -> CsvContents:
    """Read the CSV file at ``args.csv`` and return its rows.

    Each row is a dict keyed by the CSV header names, as produced by
    ``csv.DictReader``.

    Args:
        args: parsed arguments; only ``args.csv`` (path to the CSV) is read.

    Returns:
        A list with one dict per data row, in file order.
    """
    # newline="" lets the csv module do its own newline handling, so line
    # breaks embedded in quoted fields are preserved verbatim (this also
    # mirrors how the file is opened when it is written back).
    with open(args.csv, "r", newline="", encoding="utf-8") as csvfile:
        return list(csv.DictReader(csvfile))
|
||||
|
||||
|
||||
|
@ -232,8 +233,8 @@ def assert_known_google_forms_columns_present(csv_contents: CsvContents):
|
|||
KnownColumns.Timestamp in csv_contents[0]
|
||||
), f"Provided CSV file does not have a {repr(KnownColumns.Timestamp)} column"
|
||||
assert (
|
||||
KnownColumns.EmailAddress in csv_contents[0]
|
||||
), f"Provided CSV file does not have an {repr(KnownColumns.EmailAddress)} column"
|
||||
KnownColumns.UserId in csv_contents[0]
|
||||
), f"Provided CSV file does not have an {repr(KnownColumns.UserId)} column"
|
||||
|
||||
|
||||
def detect_dynamic_columns(csv_contents: CsvContents) -> DynamicColumns:
|
||||
|
@ -272,33 +273,31 @@ def maybe_generate_aliases(
|
|||
with open("aliases/suswords.txt", "r", encoding="utf-8") as suswords_file:
|
||||
suswords = set(line.rstrip() for line in suswords_file)
|
||||
|
||||
alias_to_email_address = {}
|
||||
alias_to_user_id = {}
|
||||
|
||||
seed = args.seed or args.csv
|
||||
|
||||
for row in csv_contents:
|
||||
rnd = Random(",".join([row[KnownColumns.EmailAddress], seed]))
|
||||
rnd = Random(",".join([row[KnownColumns.UserId], seed]))
|
||||
while True:
|
||||
random_alias = f"{rnd.choice(alias_parts[0])} {rnd.choice(alias_parts[1])}"
|
||||
if (
|
||||
random_alias in alias_to_email_address
|
||||
and alias_to_email_address[random_alias]
|
||||
!= row[KnownColumns.EmailAddress]
|
||||
random_alias in alias_to_user_id
|
||||
and alias_to_user_id[random_alias] != row[KnownColumns.UserId]
|
||||
):
|
||||
print(
|
||||
f"Rerolling alias for {row[KnownColumns.EmailAddress]} because {repr(random_alias)} is already being used by {alias_to_email_address[random_alias]}"
|
||||
f"Rerolling alias for {row[KnownColumns.UserId]} because {repr(random_alias)} is already being used by {alias_to_user_id[random_alias]}"
|
||||
)
|
||||
elif random_alias in usedaliases:
|
||||
elif random_alias.lower() in usedaliases:
|
||||
print(
|
||||
f"Rerolling alias for {row[KnownColumns.EmailAddress]} because {repr(random_alias)} has already been used"
|
||||
f"Rerolling alias for {row[KnownColumns.UserId]} because {repr(random_alias)} has already been used"
|
||||
)
|
||||
elif any(
|
||||
random_part in suswords for random_part in random_alias.split(" ")
|
||||
):
|
||||
print(
|
||||
f"WARNING: alias for {row[KnownColumns.EmailAddress]} {repr(random_alias)} contains a sus word"
|
||||
f"WARNING: alias for {row[KnownColumns.UserId]} {repr(random_alias)} contains a sus word"
|
||||
)
|
||||
break
|
||||
else:
|
||||
break
|
||||
row[KnownColumns.GeneratedAlias] = random_alias
|
||||
|
@ -321,7 +320,7 @@ def maybe_mark_resubmitted_entries(
|
|||
resubmitted_total = 0
|
||||
for loop_pass in ("find", "mark"):
|
||||
for row in csv_contents:
|
||||
user = row[KnownColumns.EmailAddress]
|
||||
user = row[KnownColumns.UserId]
|
||||
timestamp = parse_timestamp(row[KnownColumns.Timestamp])
|
||||
if loop_pass == "find":
|
||||
if user in most_recent_entry_per_user:
|
||||
|
@ -343,7 +342,7 @@ def maybe_save_generated_columns(args: AnonymizeEntriesArgs, csv_contents: CsvCo
|
|||
if args.dry_run:
|
||||
print("Dry run - not writing generated columns back to CSV")
|
||||
else:
|
||||
with open(args.csv, "w", newline="") as csvfile:
|
||||
with open(args.csv, "w", newline="", encoding="utf-8") as csvfile:
|
||||
writer = csv.DictWriter(csvfile, fieldnames=csv_contents[0].keys())
|
||||
writer.writeheader()
|
||||
for row in csv_contents:
|
||||
|
@ -351,29 +350,29 @@ def maybe_save_generated_columns(args: AnonymizeEntriesArgs, csv_contents: CsvCo
|
|||
print("Wrote generated columns back to CSV")
|
||||
|
||||
|
||||
def maybe_mark_unspecified_user_ids(
    args: AnonymizeEntriesArgs, csv_contents: CsvContents
):
    """Mark rows from users not listed in ``args.users`` as ignored.

    Does nothing when ``args.users`` is empty. Otherwise ``args.users`` is
    split on commas, and every row that is not already ignored and whose
    ``KnownColumns.UserId`` value is not in that set gets its
    ``KnownColumns.IgnoreFile`` column set to ``"unspecified"``. Rows are
    mutated in place; a summary line is printed at the end.

    Args:
        args: parsed arguments; only ``args.users`` is read.
        csv_contents: CSV rows to filter (modified in place).

    Raises:
        AssertionError: if no non-ignored row belongs to a specified user.
    """
    if not args.users:
        return

    # Entries are compared verbatim: no whitespace trimming or case folding.
    users = set(args.users.split(","))

    unspecified_total = 0
    specified_total = 0

    for row in csv_contents:
        # Rows already ignored for another reason keep their existing marker.
        if row[KnownColumns.IgnoreFile]:
            continue

        if row[KnownColumns.UserId] in users:
            specified_total += 1
        else:
            row[KnownColumns.IgnoreFile] = "unspecified"
            unspecified_total += 1

    assert specified_total > 0, "No responses were found from the specified users"

    s = "s" if specified_total != 1 else ""
    print(
        f"Processing {specified_total} file{s} for specified users & ignoring {unspecified_total} others"
    )
|
||||
|
||||
|
||||
|
@ -389,7 +388,7 @@ def extract_entries_to_temporary_folder(
|
|||
# Check all immediate subdirectories, followed by the root itself
|
||||
root = "/"
|
||||
contents = zip_fs.listdir(root)
|
||||
subdirs = [item for item in contents if zip_fs.isdir(item)]
|
||||
subdirs = [item for item in contents if zip_fs.isdir(item)] + [root]
|
||||
|
||||
for subdir in subdirs:
|
||||
possible_path = fs.path.join(root, subdir)
|
||||
|
@ -401,7 +400,7 @@ def extract_entries_to_temporary_folder(
|
|||
return (possible_path, possible_simfile_dir)
|
||||
|
||||
raise RuntimeError(
|
||||
"Unable to find a suitable simfile directory in the ZIP. "
|
||||
"Unable to find a suitable simfile directory in ZIP. "
|
||||
"Make sure the simfile is no more than one directory deep, "
|
||||
'e.g. contains "Simfile/simfile.ssc".'
|
||||
)
|
||||
|
@ -418,6 +417,7 @@ def extract_entries_to_temporary_folder(
|
|||
temp_fs = TempFS(identifier="dimocracy-voucher")
|
||||
|
||||
for row in csv_contents:
|
||||
try:
|
||||
if row[KnownColumns.IgnoreFile]:
|
||||
continue
|
||||
zip_absolute_path = os.path.join(
|
||||
|
@ -429,6 +429,11 @@ def extract_entries_to_temporary_folder(
|
|||
row[KnownColumns.ExtractedTo] = extract_simfile_dir(zip_fs, temp_fs)
|
||||
else:
|
||||
print("WARNING: {zip_absolute_path} not found - skipping")
|
||||
except:
|
||||
print(
|
||||
f"Exception encountered while processing row {row[KnownColumns.UserId]}"
|
||||
)
|
||||
raise
|
||||
|
||||
print(f"Extracted latest submissions to temporary directory {temp_fs.root_path}")
|
||||
return temp_fs
|
||||
|
@ -469,6 +474,39 @@ def maybe_anonymize_entries(
|
|||
print(f"Anonymized BPMs from {repr(bpm_str)} to {repr(str(bpm_values))}")
|
||||
return str(bpm_values)
|
||||
|
||||
def clean_up_difficulties(sf: Simfile):
    """Reduce ``sf`` to its single chart with notes, forced to Challenge.

    Charts whose note data yields no notes are removed from ``sf.charts``
    (a warning is printed for each). The one chart that does contain notes
    has its difficulty set to ``"Challenge"`` if it is anything else.

    ``canonical_filename`` is not defined here; it is read from the
    surrounding scope and is only used in messages.

    Args:
        sf: the simfile to clean up (modified in place).

    Raises:
        RuntimeError: if more than one chart contains notes, or if none does.
    """
    charts_to_remove: list[Chart] = []
    chart_with_notes = None

    for _chart in sf.charts:
        chart: Chart = _chart  # typing workaround

        # A chart counts as empty when iterating its NoteData yields nothing.
        notedata = NoteData(chart)
        if next(iter(notedata), None) is None:
            charts_to_remove.append(chart)
            continue

        if chart_with_notes is not None:
            raise RuntimeError(
                f"{canonical_filename} contains multiple charts with notes"
            )
        chart_with_notes = chart

        if chart.difficulty != "Challenge":
            print(
                f"WARNING: forced difficulty of chart in {canonical_filename} to Challenge"
            )
            chart.difficulty = "Challenge"

    if chart_with_notes is None:
        raise RuntimeError(f"{canonical_filename} has no charts with notes")

    # Removal is deferred until after the loop so sf.charts is not mutated
    # while it is being iterated.
    for chart_to_remove in charts_to_remove:
        print(
            f"WARNING: removing {chart_to_remove.difficulty} chart with no notes from {canonical_filename}"
        )
        # Remove from the simfile that was passed in. The previous code used
        # the outer `sm` variable, which is the wrong object when this is
        # called with an SSC simfile.
        sf.charts.remove(chart_to_remove)
|
||||
|
||||
for row in csv_contents:
|
||||
if row[KnownColumns.IgnoreFile]:
|
||||
continue
|
||||
|
@ -498,6 +536,8 @@ def maybe_anonymize_entries(
|
|||
sm.genre = ""
|
||||
sm.music = f"{canonical_filename}.ogg"
|
||||
sm.bpms = anonymize_bpms(sm.bpms)
|
||||
clean_up_difficulties(sm)
|
||||
|
||||
for _chart in sm.charts:
|
||||
sm_chart: SMChart = _chart # typing workaround
|
||||
sm_chart.description = row[KnownColumns.GeneratedAlias]
|
||||
|
@ -520,6 +560,7 @@ def maybe_anonymize_entries(
|
|||
ssc.discimage = ""
|
||||
ssc.labels = ""
|
||||
ssc.bpms = anonymize_bpms(ssc.bpms)
|
||||
clean_up_difficulties(ssc)
|
||||
for _chart in ssc.charts:
|
||||
ssc_chart: SSCChart = _chart # typing workaround
|
||||
ssc_chart.description = ""
|
||||
|
@ -537,8 +578,11 @@ def maybe_anonymize_entries(
|
|||
if dir_entry.is_file():
|
||||
if (
|
||||
dir_entry.name.endswith(".old")
|
||||
or dir_entry.name.endswith(".sm~")
|
||||
or dir_entry.name.endswith(".ssc~")
|
||||
or dir_entry.name.endswith(".txt")
|
||||
or dir_entry.name.endswith(".zip")
|
||||
or dir_entry.name == ".DS_Store"
|
||||
):
|
||||
# These are definitely safe to delete for distribution
|
||||
os.remove(dir_entry.path)
|
||||
|
@ -604,7 +648,7 @@ def main(argv: list[str]):
|
|||
maybe_save_generated_columns(args, csv_contents)
|
||||
|
||||
# Generate temporary CSV columns
|
||||
maybe_mark_unspecified_emails(args, csv_contents)
|
||||
maybe_mark_unspecified_user_ids(args, csv_contents)
|
||||
|
||||
temp_fs = extract_entries_to_temporary_folder(args, csv_contents, dynamic_columns)
|
||||
maybe_anonymize_entries(args, csv_contents, temp_fs)
|
||||
|
|
Loading…
Reference in a new issue