Skip to content

Instantly share code, notes, and snippets.

@andybak
Created July 13, 2024 11:54
Show Gist options
  • Save andybak/788c07fefb1d6fb9dfe6d546010188e1 to your computer and use it in GitHub Desktop.
Save andybak/788c07fefb1d6fb9dfe6d546010188e1 to your computer and use it in GitHub Desktop.
# Scan all subdirectories and count how many files of each type exist
import json
import os
import collections
from pprint import pprint
import matplotlib.pyplot as plt
def _load_json_object(file_path, label):
    """Parse *file_path* as JSON and return its top-level object, or None.

    A message is printed and None is returned when the file is not valid
    JSON or when its top-level value is not a JSON object. *label* is the
    short file-kind name ("json" or "gltf") used in those messages.
    """
    # "utf-8-sig" strips a leading BOM, which json.load would otherwise
    # reject; errors="replace" keeps badly encoded bytes from aborting.
    with open(file_path, "r", encoding="utf-8-sig", errors="replace") as f:
        try:
            doc = json.load(f)
        except json.JSONDecodeError:
            print(f"Error decoding {label}", file_path)
            return None
    if not isinstance(doc, dict):
        print(f"Unexpected top-level {label} value", file_path)
        return None
    return doc


def _format_types_key(doc):
    """Return the sorted, comma-joined "formatType" values of doc["formats"]."""
    formats = doc.get("formats", [])
    if not isinstance(formats, list):
        formats = []
    names = []
    for entry in formats:
        value = entry.get("formatType") if isinstance(entry, dict) else None
        names.append("MISSING" if value is None else str(value))
    return ",".join(sorted(names))


def _generator_key(doc):
    """Return the part of doc["asset"]["generator"] before the first "."."""
    asset = doc.get("asset", {})
    generator = asset.get("generator", "unknown") if isinstance(asset, dict) else "unknown"
    if not isinstance(generator, str):
        generator = "unknown"
    return generator.split(".")[0]


def scan_files(root, exclude=("d:/Poly/_Polyscan",)):
    """Walk every immediate subdirectory of *root* and tally what it contains.

    Files that sit directly in *root* are not visited; only the trees below
    its immediate subdirectories are walked.

    Args:
        root: Directory whose immediate subdirectories are scanned.
        exclude: A path, an iterable of paths, or None. Immediate
            subdirectories of *root* matching one of these paths (after
            abspath/normcase) are skipped.

    Returns:
        A tuple ``(files_total, dirs_total, file_count, gltf_stats,
        json_stats, dir_count)`` where

        * ``files_total`` is the number of files visited,
        * ``dirs_total`` is the number of directories walked,
        * ``file_count`` maps lower-cased extension (".jpeg" folded into
          ".jpg") to the number of files,
        * ``gltf_stats`` maps the glTF generator prefix to a file count,
        * ``json_stats`` maps the comma-joined "formatType" list of each
          .json file to a file count,
        * ``dir_count`` maps the comma-joined set of extensions found in a
          directory to the number of directories with that exact set.
    """
    if exclude is None:
        exclude = ()
    elif isinstance(exclude, (str, os.PathLike)):
        exclude = (exclude,)
    excluded = {os.path.normcase(os.path.abspath(p)) for p in exclude}

    dirs_total = 0
    files_total = 0
    dir_count = collections.defaultdict(int)
    file_count = collections.defaultdict(int)
    json_stats = collections.defaultdict(int)
    gltf_stats = collections.defaultdict(int)

    for entry in os.listdir(root):
        subdir = os.path.join(root, entry)
        if not os.path.isdir(subdir):
            continue
        print(f"Scanning [{files_total} files {dirs_total} dirs]: {subdir} ")
        if os.path.normcase(os.path.abspath(subdir)) in excluded:
            print("excluding", subdir)
            continue

        for path, _dirs, files in os.walk(subdir):
            # NOTE: the directory signature keeps ".jpeg" and ".jpg" distinct,
            # unlike file_count below, which folds them together.
            extensions = ",".join(sorted({os.path.splitext(name)[1].lower() for name in files}))
            dir_count[extensions] += 1
            dirs_total += 1

            for name in files:
                # Count the file up front so a parse failure below cannot
                # make files_total drift from sum(file_count.values()).
                files_total += 1
                ext = os.path.splitext(name)[1].lower()
                if ext == ".jpeg":
                    ext = ".jpg"
                file_count[ext] += 1

                if ext == ".json":
                    doc = _load_json_object(os.path.join(path, name), "json")
                    if doc is not None:
                        json_stats[_format_types_key(doc)] += 1
                elif ext == ".gltf":
                    doc = _load_json_object(os.path.join(path, name), "gltf")
                    if doc is not None:
                        gltf_stats[_generator_key(doc)] += 1

    return files_total, dirs_total, file_count, gltf_stats, json_stats, dir_count
if __name__ == "__main__":
    def autopct_func(pct):
        """Return the percentage text for a pie slice, or '' when it is 5% or less."""
        return ('%1.1f%%' % pct) if pct > 5 else ''

    total_files, total_dirs, file_data, gltf_data, json_data, dir_data = scan_files("d:/Poly")
    print(f"Total: {total_files} files {total_dirs} dirs")
    print("\n\n")
    pprint(file_data)
    pprint(gltf_data)
    pprint(json_data)
    pprint(dir_data)

    # Generate and display a matplotlib pie chart of the file types.
    # rcParams are set BEFORE the figure is created: 'figure.autolayout' is
    # read when a figure is constructed, so updating it afterwards would not
    # affect the figure being drawn.
    plt.rcParams.update({
        'font.size': 22,  # increase font size
        'figure.autolayout': True,
    })
    width = 1024
    height = 1024
    plt.figure(figsize=(width / 100, height / 100), dpi=100)
    # hide labels for small slices (50 files or fewer)
    labels = [key if value > 50 else '' for key, value in file_data.items()]
    plt.pie(list(file_data.values()), labels=labels, autopct=autopct_func)
    plt.show()

    # Generate and display a matplotlib histogram of the file types
    # width = 1024
    # height = 768
    # plt.figure(figsize=(width / 100, height / 100), dpi=100)
    # plt.bar(range(len(file_data)), list(file_data.values()), align='center')
    # plt.xticks(range(len(file_data)), list(file_data.keys()))
    # plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment