Skip to content

Instantly share code, notes, and snippets.

@andybak
Last active October 28, 2024 14:20
Show Gist options
  • Save andybak/97014fa0c26abd2904c7ba9d22ab9e57 to your computer and use it in GitHub Desktop.
Save andybak/97014fa0c26abd2904c7ba9d22ab9e57 to your computer and use it in GitHub Desktop.
import datetime
import json
import math
import os
import re
# Input tree: one sub-directory per asset, each expected to contain an embed.html.
EMBED_DIR = r"c:\poly_html\poly.google.com\view"
# Output tree: the aggregate all_data.json / all_data.jsonl plus one data.json per asset.
JSON_DIR = r"c:\poly_megajson\poly.google.com\view"
# Tree holding the "Updated glTF" files, using the same per-asset directory names.
GLTF_DIR = r"C:\poly_updated_gltf\poly.google.com\view"

# Captures the text between "data:" and ", sideChannel:" of an
# AF_initDataCallback({key: 'ds:...', ...}) call embedded in the page.
# r'<meta property="og:asset".+>'
regex_pattern = re.compile(r"AF_initDataCallback\(\{key: 'ds:[\s\S]*?data:([\s\S]*?), sideChannel:")
def decompose_matrix(matrix):
    """Split a glTF node matrix into rotation, translation and scale.

    glTF stores ``node.matrix`` as 16 floats in column-major order, so column
    ``c`` occupies ``matrix[4 * c:4 * c + 4]`` and the translation is the
    fourth column (indices 12-14). The previous implementation read the
    translation from indices 3, 7 and 11 (the bottom row, always zero for a
    valid glTF matrix) and applied a row-major quaternion formula to
    column-major data, which produced the inverse rotation.

    Args:
        matrix: Sequence of 16 numbers, column-major.

    Returns:
        A dict with ``'rotation'`` as an ``[x, y, z, w]`` quaternion,
        ``'translation'`` as ``[tx, ty, tz]`` and ``'scale'`` as
        ``[sx, sy, sz]``.

    Raises:
        ValueError: If ``matrix`` does not contain exactly 16 elements.
    """
    if len(matrix) != 16:
        raise ValueError(f"expected a 16-element 4x4 matrix, got {len(matrix)} elements")

    # The upper-left 3x3 block, one list per stored column (= one basis vector).
    columns = [list(matrix[4 * c:4 * c + 3]) for c in range(3)]

    # For M = T * R * S each basis vector is a column of R multiplied by that
    # axis' scale, so the vector length is the scale.
    scale = [math.sqrt(sum(v * v for v in column)) for column in columns]

    # Divide the scale out to obtain the pure rotation; a zero-length axis is
    # left untouched to avoid dividing by zero.
    for column, axis_scale in zip(columns, scale):
        if axis_scale != 0:
            column[:] = [v / axis_scale for v in column]

    # r<row><col> of the rotation matrix; columns[c] holds (r0c, r1c, r2c).
    (r00, r10, r20), (r01, r11, r21), (r02, r12, r22) = columns

    # Rotation matrix -> quaternion, branching on the largest diagonal term so
    # the square root argument stays well away from zero.
    trace = r00 + r11 + r22
    if trace > 0:
        s = 0.5 / math.sqrt(trace + 1.0)
        w = 0.25 / s
        x = (r21 - r12) * s
        y = (r02 - r20) * s
        z = (r10 - r01) * s
    elif r00 > r11 and r00 > r22:
        s = 2.0 * math.sqrt(1.0 + r00 - r11 - r22)
        w = (r21 - r12) / s
        x = 0.25 * s
        y = (r01 + r10) / s
        z = (r02 + r20) / s
    elif r11 > r22:
        s = 2.0 * math.sqrt(1.0 + r11 - r00 - r22)
        w = (r02 - r20) / s
        x = (r01 + r10) / s
        y = 0.25 * s
        z = (r12 + r21) / s
    else:
        s = 2.0 * math.sqrt(1.0 + r22 - r00 - r11)
        w = (r10 - r01) / s
        x = (r02 + r20) / s
        y = (r12 + r21) / s
        z = 0.25 * s

    return {
        'rotation': [x, y, z, w],
        'translation': [matrix[12], matrix[13], matrix[14]],
        'scale': scale,
    }
# Notes kept from the original author: observed combinations of file roles
# (presumably "<asset count>/<number of roles>: roles"; some lines are truncated).
# 1747/4: Original glTF, Original OBJ, Updated glTF, USDZ
# 618/6: Original FBX, Original glTF, Original OBJ, Updated glTF, Original Tri OBJ, USDZ
# 303/4: Original FBX, Original glTF, Updated glTF, USDZ
# 160/3: Original glTF, Updated glTF, USDZ
# 158/3: Original OBJ, Updated glTF, USDZ
# 149/4: GLB, Original glTF, Updated glTF, USDZ
# 116/3: Original glTF, Original OBJ, Updated glTF
# 50/3: Original FBX, Original glTF, Updated glTF
# 41/5: Original FBX, Original glTF, Original OBJ, Updated glTF, USDZ Fil
# 33/3: GLB, Original glTF, Updated glTF
# 23/5: Original FBX, Original OBJ, Updated glTF, Original Tri OBJ, USDZ Fil
# 20/3: Original FBX, Updated glTF, USDZ
# 18/2: Original FBX, Original glTF
# 16/5: Original FBX, Original glTF, Original OBJ, Updated glTF, Original Tri OBJ Fil
# 14/2: Original glTF, Updated glTF
# 12/5: Original FBX, Original glTF, Updated glTF, Original Tri OBJ, USDZ Fil
# 7/3: GLB, Updated glTF, USDZ
# 6/2: Updated glTF, USDZ
# 3/2: GLB, Original glTF
# 2/1: Original glTF
# 1/4: Original FBX, Original OBJ, Updated glTF, USDZ
# 1/3: Original FBX, Original glTF, USDZ
# 1/2: Original glTF, Original OBJ
# 1/5: Original glTF, Original OBJ, Updated glTF, Original Tri OBJ, USDZ
# Updated glTF File: 2722
# Original glTF File: 2572
# USDZ File: 2542
# Original OBJ File: 2156
# Original FBX File: 846
# Original Triangulated OBJ File: 517
# GLB File: 145

# Role id (file_json[1]) -> name stored under root.role.
_FILE_ROLES = {
    1: "Original OBJ File",
    2: "Tilt File",
    4: "Unknown GLTF File",
    6: "Original FBX File",
    7: "Blocks File",
    8: "USD File",
    11: "HTML File",
    12: "Original glTF File",
    13: "Tour Creator Experience",
    15: "JSON File",
    16: "lullmodel File",
    17: "sand File",
    18: "GLB File",
    19: "sand File",
    20: "sandc File",
    21: "pb File",
    22: "Unknown GLTF File",
    24: "Original Triangulated OBJ File",
    25: "JPG (Buggy)",
    26: "USDZ File",
    30: "Updated glTF File",
    32: "Editor settings pb file",
    35: "Unknown GLTF File",
    36: "Unknown GLB File",
    38: "Unknown GLB File",
}

# File type id (file_json[2][2]) -> (formatType label, extensions accepted for it).
_FILE_TYPES = {
    1: ("OBJ", ("obj",)),
    2: ("TILT", ("tilt",)),
    4: ("GLTF", ("gltf",)),
    5: ("GLB", ("glb",)),
    6: ("FBX", ("fbx",)),
    7: ("BLOCKS", ("blocks", "poly")),
    8: ("USD", ("usd",)),
    11: ("HTML", ("html",)),
    13: ("TOUR", ("experience",)),
    15: ("LULL", ("lullmodel",)),
    16: ("SAND", ("sand",)),
    17: ("SANDC", ("sandc",)),
    19: ("USDZ", ("usdz",)),
    1000: ("JPG", ("jpg",)),
    3001: ("ZIP", ("zip",)),
    3002: ("JSON", ("json",)),
    3012: ("PB", ("pb",)),
    3016: ("PB", ("pb",)),
}


def decode_file_json(file_json, asset_id=None):
    """Convert one raw file entry of an asset into a format dict.

    Layout read from ``file_json``:
      * ``[0]`` file id
      * ``[1]`` role id (see ``_FILE_ROLES``)
      * ``[2]`` ``[<unused>, filename, file type id, url]``
      * ``[4][0]`` triangle count
      * ``[5]`` optional archive ``[id, filename, file type id, url]``; it is
        only recorded when its file type id is 3001 (zip)

    Args:
        file_json: The raw list describing one file.
        asset_id: Identifier used in diagnostic messages. When omitted, the
            module-level ``asset_id`` set by the calling script is used if it
            exists, so existing one-argument calls print the same messages.

    Returns:
        A dict with ``root`` (``id``, ``url``, ``role``), ``formatComplexity``
        (``triangleCount``), ``formatType`` and, for zip archives, ``archive``
        (``id``, ``filename``, ``url``). An unrecognised file type id is
        reported on stdout and yields ``formatType == "UNKNOWN"``.

    Raises:
        AssertionError: If the role id is unknown, or the filename extension
            does not match the file type id. Raised explicitly so the checks
            also run under ``python -O``.
    """
    if asset_id is None:
        # Historically this function read the script's global directly.
        asset_id = globals().get("asset_id", "<unknown asset>")

    file_id = file_json[0]
    role_id = file_json[1]
    file_name = file_json[2][1]
    file_type_id = file_json[2][2]
    file_url = file_json[2][3]
    extension = file_name.rsplit(".", 1)[-1].lower()

    main_role = _FILE_ROLES.get(role_id)
    if main_role is None:
        message = f"Unknown main role in {asset_id}: {role_id} for {file_name.split('.')[-1]}"
        print(message)
        raise AssertionError(message)

    known_type = _FILE_TYPES.get(file_type_id)
    if known_type is None:
        format_type = "UNKNOWN"
        print(f"Unknown file type in {asset_id}: {file_type_id} for {file_name}")
    else:
        format_type, expected_extensions = known_type
        if extension not in expected_extensions:
            raise AssertionError(
                f"Unexpected extension {extension!r} for file type {file_type_id} "
                f"({format_type}) in {asset_id}: {file_name}"
            )

    file_dict = {
        "root": {
            "id": file_id,
            "url": file_url,
            "role": main_role,
        },
        "formatComplexity": {
            "triangleCount": file_json[4][0],
        },
        "formatType": format_type,
    }

    # Element 5, when present, is a zip or usdz archive; only zips are kept.
    if len(file_json) > 5 and file_json[5][2] == 3001:
        file_dict["archive"] = {
            "id": file_json[5][0],
            "filename": file_json[5][1],
            "url": file_json[5][3],
        }
    return file_dict
def decode_subfile_json(subfile_json):
    """Describe one sub-file (resource) entry as a dict.

    Reads the id from index 0, the filename from index 1 and the url from
    index 3 of ``subfile_json``; index 2 is not used.
    """
    resource_id = subfile_json[0]
    resource_name = subfile_json[1]
    resource_url = subfile_json[3]
    return {"id": resource_id, "filename": resource_name, "url": resource_url}
# Category id (asset field 7) -> name stored under "category".
_CATEGORY_NAMES = {
    0: "Uncategorized",
    1: "Art",
    2: "Animals & Pets",
    3: "Architecture",
    4: "Places & Scenes",
    5: "(Unused)",
    6: "Food & Drink",
    7: "Nature",
    8: "People & Characters",
    9: "Tools & Technology",
    10: "Transport",
    11: "Miscellaneous",
    12: "Objects",
    13: "Culture & Humanity",
    14: "Current Events",
    15: "Furniture & Home",
    16: "History",
    17: "Science",
    18: "Sports & Fitness",
    19: "Travel & Leisure",
}

all_jsonl_path = os.path.join(JSON_DIR, "all_data.jsonl")
all_json_path = os.path.join(JSON_DIR, "all_data.json")
os.makedirs(os.path.dirname(all_json_path), exist_ok=True)

# Not referenced anywhere below; kept so the module's globals are unchanged.
stats_set = set()
i = 0

# Walk every asset directory under EMBED_DIR, decode the data blob embedded in
# its embed.html, enrich it with metadata from the asset's "Updated glTF" file
# when that file exists locally, and write the result three times:
#   * as one element of the array in all_data.json,
#   * as one line of all_data.jsonl,
#   * as <JSON_DIR>/<asset dir>/data.json.
with open(all_json_path, "w", encoding="utf-8") as all_json_outfile, \
        open(all_jsonl_path, "w", encoding="utf-8") as all_jsonl_outfile:
    all_json_outfile.write("[")
    # Separators are written *before* every element but the first, so the array
    # never ends with a trailing comma (which would make the file invalid JSON).
    wrote_entry = False

    for dir_name in os.listdir(EMBED_DIR):
        embed_dir = os.path.join(EMBED_DIR, dir_name)
        if not os.path.isdir(embed_dir):
            continue

        embed_path = os.path.join(embed_dir, "embed.html")
        json_file_path = os.path.join(JSON_DIR, dir_name, "data.json")
        if not os.path.exists(embed_path):
            print(f"No embed found for {embed_path}")
            continue

        with open(embed_path, "r", encoding="utf-8", errors="replace") as embed:
            contents = embed.read()

        # Pattern.findall(string, pos, endpos) takes no flags argument; passing
        # re.MULTILINE here used to be interpreted as pos=8.
        regex_results = regex_pattern.findall(contents)
        if not regex_results:
            print(f"No match for {embed_path}: {len(regex_results)}")
            continue

        json_result_dict = {}
        try:
            json_data = json.loads(regex_results[0])
            if len(json_data[0]) < 2:
                print("No data: " + dir_name)
                continue

            asset = json_data[0][1]
            # Module-level on purpose: decode_file_json reports problems using
            # the global asset_id.
            asset_id = asset[0]
            json_result_dict["name"] = asset[1]
            json_result_dict["description"] = asset[2]
            json_result_dict["thumbnail_url"] = asset[3]

            # Field 4 is the viewer data container. Original author's note: a
            # container with a single non-null entry might be a Tour Creator asset.
            if asset[4] is None:
                print("No viewer data: ", asset_id)
                continue

            # 5 is empty
            # 6: len=8 [bool, False, bool, None, None, bool, bool, bool]
            category_id = asset[7]
            if category_id == 0:
                print("Uncategorized: ", dir_name)
            json_result_dict["category"] = _CATEGORY_NAMES[category_id]

            licence_id = asset[8]  # 0 = CC-BY-ND, 1 = CC-BY
            if licence_id not in (0, 1):
                # Report the licence id itself (field 9 is the visibility).
                print("Unknown licence: ", asset_id, licence_id)
            else:
                json_result_dict["licence"] = (
                    "CREATIVE_COMMONS_BY_ND" if licence_id == 0 else "CREATIVE_COMMONS_BY"
                )

            visibility_id = asset[9]  # 2 = Unlisted, 3 = Public
            if visibility_id not in (2, 3):
                print("Unknown visibility: ", asset_id, visibility_id)
            else:
                json_result_dict["visibility"] = "UNLISTED" if visibility_id == 2 else "PUBLIC"

            # 10 = null
            # 11 = [camera transform?, camera offset? (empty or 3 floats),
            #       int (mostly 0, sometimes 45 or 80 - camera fov?),
            #       int (mostly 0, sometimes 1 or 3)] - meaning unconfirmed
            # 12 = unknown ints
            json_result_dict["authorName"] = asset[13]

            # Field 14 is divided by 1e6 before being read as a POSIX timestamp,
            # i.e. it is treated as microseconds.
            created_seconds = float(asset[14]) / 1000 / 1000
            dt = datetime.datetime.fromtimestamp(created_seconds, tz=datetime.timezone.utc)
            created_timestamp = dt.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
            json_result_dict["createTime"] = created_timestamp
            json_result_dict["updateTime"] = created_timestamp
            json_result_dict["authorId"] = asset[15]  # Maybe?
            # 16 = bool, 17 = null, 18 = thumbnail url ?
            json_result_dict["likes"] = asset[19]
            # 20 = null, 21 = background color (handled after the formats),
            # 22 = int, 23 = null or []
            json_result_dict["tags"] = asset[24]
            # 25 = some id, 26 = some int array, 27 = unknown
            json_result_dict["views"] = asset[28]  # ?
            # 29-34 = unknown

            formats_list = []
            color_space = "LINEAR"
            for file_json in asset[35]:
                files_dict = decode_file_json(file_json)
                resources = [decode_subfile_json(subfile) for subfile in file_json[3]]
                if resources:
                    files_dict["resources"] = resources
                formats_list.append(files_dict)
                if files_dict["formatType"] == "TILT":
                    color_space = "GAMMA"
            json_result_dict["formats"] = formats_list

            # Local copy of the "Updated glTF" file; if several formats carry
            # that role the last one wins.
            gltf_path = None
            for format_dict in formats_list:
                if format_dict["root"]["role"].startswith("Updated glTF"):
                    gltf_path = os.path.join(
                        GLTF_DIR, dir_name, format_dict["root"]["url"].split("/")[-1]
                    )

            if gltf_path is not None and os.path.exists(gltf_path):
                with open(gltf_path, "r", encoding="utf-8", errors="replace") as gltf:
                    gltf_data = json.loads(gltf.read())

                _extras = gltf_data.get("extras", {})
                # motionPath: FULL_ROTATION, SIDE_TO_SIDE, NONE
                json_result_dict["GOOGLE_initial_camera_motion"] = _extras.get("GOOGLE_initial_camera_motion", None)
                # disableShadows: true, false
                json_result_dict["GOOGLE_lighting_rig"] = _extras.get("GOOGLE_lighting_rig", None)

                _extensions = gltf_data["scenes"][0].get("extensions", {})
                # backgroundImage: {index: int}
                # color: [float, float, float]
                json_result_dict["GOOGLE_backgrounds"] = _extensions.get("GOOGLE_backgrounds", None)
                # scaling_factor: float
                json_result_dict["GOOGLE_real_world_transform"] = _extensions.get("GOOGLE_real_world_transform", None)
                # rotation: [float, float, float, float]
                json_result_dict["GOOGLE_scene_rotation"] = _extensions.get("GOOGLE_scene_rotation", None)
                # light: int
                json_result_dict["GOOGLE_lights_image_based"] = _extensions.get("GOOGLE_lights_image_based", None)

                _scene_extras = gltf_data["scenes"][0].get("extras", {})
                GOOGLE_camera_index = _scene_extras.get("GOOGLE_camera_index", {}).get("nodeIndex", None)

                camera = {}
                cameras = gltf_data.get("cameras", [])
                if len(cameras) > 0:
                    # The first node referencing any camera supplies the
                    # transform, which is copied onto the first camera.
                    camera_node = None
                    for node in gltf_data["nodes"]:
                        if node.get("camera", None) is not None:
                            camera_node = node
                            break
                    if camera_node is not None:
                        if "matrix" in camera_node:
                            tr = decompose_matrix(camera_node["matrix"])
                            cameras[0]["translation"] = tr["translation"]
                            cameras[0]["rotation"] = tr["rotation"]
                        elif "translation" in camera_node:
                            cameras[0]["translation"] = camera_node["translation"]
                            if "rotation" in camera_node:
                                cameras[0]["rotation"] = camera_node["rotation"]
                            else:
                                print("Translation without rotation", dir_name)
                        else:
                            print("Unknown camera transform type", dir_name)
                    camera.update(cameras[0])
                # panLimits: min: [float, float, float], max: [float, float, float]
                # yawLimits: {min: float, max: float}
                # pitchLimits: {min: float, max: float}
                # distanceLimits: {min: float, max: float}
                # mode: orbit, moveableOrbit
                # pivot: [float, float, float]
                camera["GOOGLE_camera_settings"] = _extensions.get("GOOGLE_camera_settings", None)
                # NOTE(review): the merged `camera` dict (including
                # GOOGLE_camera_settings) is built but the raw `cameras` list is
                # what gets stored. Possibly `camera` was intended; left as-is
                # because changing it would alter the output schema - confirm.
                json_result_dict["camera"] = cameras

                # groundColor: [float, float, float]
                json_result_dict["GOOGLE_hemi_light"] = _scene_extras.get("GOOGLE_hemi_light", None)
                # visualCenterPoint: 3 floats
                # stats: {radius: float, stdev: float, centroid: 3 floats}
                json_result_dict["GOOGLE_geometry_data"] = _scene_extras.get("GOOGLE_geometry_data", None)

            background = asset[21]
            if background is not None and len(background) == 1:
                json_result_dict["presentationParams"] = {
                    "orientingRotation": {"w": 1},
                    "colorSpace": color_space,
                    "backgroundColor": background[0],
                }
            # 36 = int array, 37 = int array, 38 = textures?
            # json_data[0][2] is just the upload url, [3]-[5] are bools,
            # [6] has 12 elements ([6][0] is probably the asset id again,
            # [6][1] the title again, [6][2] probably the same as the viewer data).

            # Serialise everything before touching the output files so that a
            # failure cannot leave a half-written element behind.
            aggregate_entry = json.dumps({f"{asset_id}": json_result_dict}, indent=4)
            json_result_dict["assetId"] = asset_id
            jsonl_entry = json.dumps(json_result_dict)
            per_asset_entry = json.dumps(json_result_dict, indent=4)

            if wrote_entry:
                all_json_outfile.write(",\n")
            all_json_outfile.write(aggregate_entry)
            wrote_entry = True

            all_jsonl_outfile.write(jsonl_entry)
            all_jsonl_outfile.write("\n")

            os.makedirs(os.path.dirname(json_file_path), exist_ok=True)
            with open(json_file_path, "w", encoding="utf-8") as json_outfile:
                json_outfile.write(per_asset_entry)
        except json.decoder.JSONDecodeError:
            print(f"Error parsing json: {embed_path}")
            continue
        except AssertionError:
            # Schema sanity checks (e.g. in decode_file_json) stay fatal, as before.
            raise
        except Exception as e:
            # This handler previously repeated JSONDecodeError and was therefore
            # unreachable; its message shows it was meant as the catch-all.
            print(f"Unknown error for {embed_path}: {e}")
            continue

    all_json_outfile.write("]")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment