Created
January 11, 2023 15:25
-
-
Save phydev/2218b8c95d41115be62093a01b74bd19 to your computer and use it in GitHub Desktop.
Sample and shuffle data to provide a fake dataset with the same structure that can be used for developing methods for sensitive data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def sample_and_shuffle(file, frac=0.1, save_to_file=False, output_path="fake_data.csv", **kwargs):
    """
    Build a fake dataset with the same structure as a sensitive csv file.

    A fraction of the rows is drawn and every column is then resampled
    independently, so the values inside one output row no longer belong to
    the same original record. This is useful for developing methods outside
    the secure server and testing them on a file that keeps the same columns.

    After shuffling, the result is checked for a "data breach": an output row
    that is still identical to a complete original row. If one is found a
    message is printed and nothing is written to disk. The shuffled dataframe
    is returned in either case, so callers that care must not rely on the
    return value alone being breach-free.

    :param file: path to csv file
    :param frac: fraction of data to sample
    :param save_to_file: save the sampled data to a csv file
    :param output_path: where to write the csv file when save_to_file is set
    :param kwargs: keyword arguments for pandas.read_csv
    :return: sampled dataframe
    """
    import pandas as pd

    # read data
    df = pd.read_csv(file, **kwargs)

    # sample data; this fixes the number of rows (and the index) of the result.
    # The explicit copy makes the column assignments below unambiguous.
    df_shuffled = df.sample(frac=frac).copy()

    # shuffle each column: an independent draw per column breaks the link
    # between the values of a single original record
    for column in df.columns:
        df_shuffled[column] = df[column].sample(frac=frac).values

    # check for data breach on all sampled records.
    # An inner merge on every column keeps exactly the shuffled rows that still
    # equal some original row. Compared with a per-id loop this needs no
    # specific id column, copes with repeated ids, treats missing values as
    # equal to each other, and is a single pass instead of one scan per row.
    columns = list(df.columns)
    breach = False
    if columns and not df_shuffled.empty:
        leaked = df_shuffled.merge(df.drop_duplicates(), how="inner", on=columns)
        if not leaked.empty:
            breach = True
            if "record_id" in leaked.columns:
                record_label = leaked["record_id"].iloc[0]
            else:
                record_label = "(no record_id column)"
            print("data breach on record:", record_label)

    # save to file only when no original record survived the shuffle
    if not breach and save_to_file:
        df_shuffled.to_csv(output_path, sep=";", index=False)
        print("Success!")

    return df_shuffled
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment