Last active
October 28, 2024 14:20
-
-
Save andybak/97014fa0c26abd2904c7ba9d22ab9e57 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import datetime | |
import json | |
import math | |
import os | |
import re | |
# Root folders used by the export loop further down in this file:
#   EMBED_DIR - one sub-folder per asset, each expected to hold an "embed.html"
#   JSON_DIR  - destination of the generated data.json / all_data.json(l) files
#   GLTF_DIR  - one sub-folder per asset holding the updated glTF files
EMBED_DIR = "c:\\poly_html\\poly.google.com\\view"
JSON_DIR = "c:\\poly_megajson\\poly.google.com\\view"
GLTF_DIR = "C:\\poly_updated_gltf\\poly.google.com\\view"

# Captures (group 1) whatever sits between "data:" and ", sideChannel:" inside
# an AF_initDataCallback({key: 'ds:...'}) call.  [\s\S] is used instead of "."
# so the lazy matches may run across line breaks.
# (An earlier attempt matched r'<meta property="og:asset".+>' instead.)
regex_pattern = re.compile(r"AF_initDataCallback\(\{key: 'ds:[\s\S]*?data:([\s\S]*?), sideChannel:")
def decompose_matrix(matrix):
    """Split a glTF node matrix into rotation, translation and scale.

    ``matrix`` is a flat sequence of 16 numbers in column-major order, the
    layout glTF uses for ``node.matrix``: the element at row ``r`` and column
    ``c`` is stored at index ``c * 4 + r``, so the translation occupies
    indices 12, 13 and 14.

    Returns a dict with:
        'rotation'    - quaternion as [x, y, z, w]
        'translation' - [tx, ty, tz]
        'scale'       - [sx, sy, sz], the lengths of the three basis columns

    Axes with zero scale are left un-normalised to avoid dividing by zero.
    """
    # Upper-left 3x3 block; names are m<row><column>, read column by column.
    m11, m21, m31 = matrix[0], matrix[1], matrix[2]
    m12, m22, m32 = matrix[4], matrix[5], matrix[6]
    m13, m23, m33 = matrix[8], matrix[9], matrix[10]

    # For M = T * R * S every basis column of the 3x3 block is a rotated unit
    # vector multiplied by that axis' scale, so the column length is the scale.
    scale_x = math.sqrt(m11 ** 2 + m21 ** 2 + m31 ** 2)
    scale_y = math.sqrt(m12 ** 2 + m22 ** 2 + m32 ** 2)
    scale_z = math.sqrt(m13 ** 2 + m23 ** 2 + m33 ** 2)

    # Divide the scale back out so only the rotation remains.
    if scale_x != 0:
        m11 /= scale_x
        m21 /= scale_x
        m31 /= scale_x
    if scale_y != 0:
        m12 /= scale_y
        m22 /= scale_y
        m32 /= scale_y
    if scale_z != 0:
        m13 /= scale_z
        m23 /= scale_z
        m33 /= scale_z

    # Rotation matrix -> quaternion.  The branch is picked from the largest
    # diagonal term so the value under the square root stays well away from 0.
    trace = m11 + m22 + m33
    if trace > 0:
        s = 0.5 / math.sqrt(trace + 1.0)
        w = 0.25 / s
        x = (m32 - m23) * s
        y = (m13 - m31) * s
        z = (m21 - m12) * s
    elif (m11 > m22) and (m11 > m33):
        s = 2.0 * math.sqrt(1.0 + m11 - m22 - m33)
        w = (m32 - m23) / s
        x = 0.25 * s
        y = (m12 + m21) / s
        z = (m13 + m31) / s
    elif m22 > m33:
        s = 2.0 * math.sqrt(1.0 + m22 - m11 - m33)
        w = (m13 - m31) / s
        x = (m12 + m21) / s
        y = 0.25 * s
        z = (m23 + m32) / s
    else:
        s = 2.0 * math.sqrt(1.0 + m33 - m11 - m22)
        w = (m21 - m12) / s
        x = (m13 + m31) / s
        y = (m23 + m32) / s
        z = 0.25 * s

    # Translation is the fourth column, i.e. indices 12..14 in column-major.
    tx = matrix[12]
    ty = matrix[13]
    tz = matrix[14]

    return {
        'rotation': [x, y, z, w],
        'translation': [tx, ty, tz],
        'scale': [scale_x, scale_y, scale_z]
    }
# Role id (file_json[1]) -> human readable role name.
#
# Counts gathered while exploring the data set (kept verbatim; presumably
# "<number of assets>/<files per asset>: <roles present>" - TODO confirm):
# 1747/4: Original glTF, Original OBJ, Updated glTF, USDZ
# 618/6: Original FBX, Original glTF, Original OBJ, Updated glTF, Original Tri OBJ, USDZ
# 303/4: Original FBX, Original glTF, Updated glTF, USDZ
# 160/3: Original glTF, Updated glTF, USDZ
# 158/3: Original OBJ, Updated glTF, USDZ
# 149/4: GLB, Original glTF, Updated glTF, USDZ
# 116/3: Original glTF, Original OBJ, Updated glTF
# 50/3: Original FBX, Original glTF, Updated glTF
# 41/5: Original FBX, Original glTF, Original OBJ, Updated glTF, USDZ Fil
# 33/3: GLB, Original glTF, Updated glTF
# 23/5: Original FBX, Original OBJ, Updated glTF, Original Tri OBJ, USDZ Fil
# 20/3: Original FBX, Updated glTF, USDZ
# 18/2: Original FBX, Original glTF
# 16/5: Original FBX, Original glTF, Original OBJ, Updated glTF, Original Tri OBJ Fil
# 14/2: Original glTF, Updated glTF
# 12/5: Original FBX, Original glTF, Updated glTF, Original Tri OBJ, USDZ Fil
# 7/3: GLB, Updated glTF, USDZ
# 6/2: Updated glTF, USDZ
# 3/2: GLB, Original glTF
# 2/1: Original glTF
# 1/4: Original FBX, Original OBJ, Updated glTF, USDZ
# 1/3: Original FBX, Original glTF, USDZ
# 1/2: Original glTF, Original OBJ
# 1/5: Original glTF, Original OBJ, Updated glTF, Original Tri OBJ, USDZ
# Updated glTF File: 2722
# Original glTF File: 2572
# USDZ File: 2542
# Original OBJ File: 2156
# Original FBX File: 846
# Original Triangulated OBJ File: 517
# GLB File: 145
_FILE_ROLE_NAMES = {
    1: "Original OBJ File",
    2: "Tilt File",
    4: "Unknown GLTF File",
    6: "Original FBX File",
    7: "Blocks File",
    8: "USD File",
    11: "HTML File",
    12: "Original glTF File",
    13: "Tour Creator Experience",
    15: "JSON File",
    16: "lullmodel File",
    17: "sand File",
    18: "GLB File",
    19: "sand File",
    20: "sandc File",
    21: "pb File",
    22: "Unknown GLTF File",
    24: "Original Triangulated OBJ File",
    25: "JPG (Buggy)",
    26: "USDZ File",
    30: "Updated glTF File",
    32: "Editor settings pb file",
    35: "Unknown GLTF File",
    36: "Unknown GLB File",
    38: "Unknown GLB File",
}

# File type id (file_json[2][2]) -> (format label, file extensions accepted).
_FILE_FORMATS = {
    1: ("OBJ", ("obj",)),
    2: ("TILT", ("tilt",)),
    4: ("GLTF", ("gltf",)),
    5: ("GLB", ("glb",)),
    6: ("FBX", ("fbx",)),
    7: ("BLOCKS", ("blocks", "poly")),
    8: ("USD", ("usd",)),
    11: ("HTML", ("html",)),
    13: ("TOUR", ("experience",)),
    15: ("LULL", ("lullmodel",)),
    16: ("SAND", ("sand",)),
    17: ("SANDC", ("sandc",)),
    19: ("USDZ", ("usdz",)),
    1000: ("JPG", ("jpg",)),
    3001: ("ZIP", ("zip",)),
    3002: ("JSON", ("json",)),
    3012: ("PB", ("pb",)),
    3016: ("PB", ("pb",)),
}


def decode_file_json(file_json, asset_id=None):
    """Turn one raw file entry into a format description dict.

    Layout of ``file_json`` as read by this function:
        [0]    file id
        [1]    role id (looked up in ``_FILE_ROLE_NAMES``)
        [2]    [<unused>, file name, file type id, url]
        [3]    sub files (not read here; the caller decodes them)
        [4][0] triangle count
        [5]    optional [id, file name, type id, url]; only exported as
               "archive" when its type id is 3001 (zip)

    ``asset_id`` is used only in diagnostic messages.  When omitted, the
    module-level ``asset_id`` maintained by the export loop is used if it
    exists, so existing single-argument calls keep their messages.

    Returns a dict with the keys "root" (id, url, role), "formatComplexity"
    (triangleCount), "formatType" and, when present, "archive" (id, filename,
    url).  An unrecognised file type id is reported and yields
    ``formatType == "UNKNOWN"``.

    Raises:
        AssertionError: for an unknown role id, or when the file extension
            does not match the file type id.
    """
    if asset_id is None:
        # Fall back to the loop's global instead of failing with NameError.
        asset_id = globals().get("asset_id", "<unknown asset>")

    file_id = file_json[0]
    role_id = file_json[1]
    file_name = file_json[2][1]
    file_type_id = file_json[2][2]
    file_url = file_json[2][3]
    extension = file_name.rsplit(".", 1)[-1].lower()

    main_role = _FILE_ROLE_NAMES.get(role_id)
    if main_role is None:
        message = f"Unknown main role in {asset_id}: {role_id} for {file_name.split('.')[-1]}"
        print(message)
        # Raised explicitly (not via `assert`) so the check survives `python -O`.
        raise AssertionError(message)

    known_format = _FILE_FORMATS.get(file_type_id)
    if known_format is None:
        format_type = "UNKNOWN"
        print(f"Unknown file type in {asset_id}: {file_type_id} for {file_name}")
    else:
        format_type, accepted_extensions = known_format
        if extension not in accepted_extensions:
            raise AssertionError(
                f"Unexpected extension {extension!r} for file type {file_type_id} ({file_name})"
            )

    file_dict = {
        "root": {
            "id": file_id,
            "url": file_url,
            "role": main_role,
        },
        "formatComplexity": {
            "triangleCount": file_json[4][0],
        },
        "formatType": format_type,
    }

    # Element 5 is a zip or usdz; only the zip (type id 3001) is exported.
    if len(file_json) > 5 and file_json[5][2] == 3001:
        file_dict["archive"] = {
            "id": file_json[5][0],
            "filename": file_json[5][1],
            "url": file_json[5][3],
        }
    return file_dict
def decode_subfile_json(subfile_json):
    """Pick id, file name and url out of one raw sub-file entry.

    Reads positions 0, 1 and 3 of ``subfile_json``; position 2 is ignored.
    """
    return dict(
        id=subfile_json[0],
        filename=subfile_json[1],
        url=subfile_json[3],
    )
# ---------------------------------------------------------------------------
# Export loop: for every asset folder under EMBED_DIR, pull the embedded data
# blob out of embed.html, convert it to a flat dict, enrich it from the
# "Updated glTF" file when that exists under GLTF_DIR, and write the result to
#   JSON_DIR/<asset folder>/data.json   (one file per asset)
#   JSON_DIR/all_data.json              (array of {asset_id: record})
#   JSON_DIR/all_data.jsonl             (one record per line)
# ---------------------------------------------------------------------------

# Category id (json_data[0][1][7]) -> display name.
_CATEGORY_NAMES = {
    0: "Uncategorized",
    1: "Art",
    2: "Animals & Pets",
    3: "Architecture",
    4: "Places & Scenes",
    5: "(Unused)",
    6: "Food & Drink",
    7: "Nature",
    8: "People & Characters",
    9: "Tools & Technology",
    10: "Transport",
    11: "Miscellaneous",
    12: "Objects",
    13: "Culture & Humanity",
    14: "Current Events",
    15: "Furniture & Home",
    16: "History",
    17: "Science",
    18: "Sports & Fitness",
    19: "Travel & Leisure",
}

all_jsonl_path = os.path.join(JSON_DIR, "all_data.jsonl")
all_json_path = os.path.join(JSON_DIR, "all_data.json")
os.makedirs(os.path.dirname(all_json_path), exist_ok=True)

with open(all_json_path, "w", encoding="utf-8") as all_json_outfile, \
        open(all_jsonl_path, "w", encoding="utf-8") as all_jsonl_outfile:
    all_json_outfile.write("[")
    # The separator is written *before* every entry but the first, so the
    # array never ends in a trailing comma (which would be invalid JSON).
    wrote_entry = False

    for dir_name in os.listdir(EMBED_DIR):
        embed_dir = os.path.join(EMBED_DIR, dir_name)
        if not os.path.isdir(embed_dir):
            continue

        embed_path = os.path.join(embed_dir, "embed.html")
        json_file_path = os.path.join(JSON_DIR, dir_name, "data.json")
        if not os.path.exists(embed_path):
            print(f"No embed found for {embed_path}")
            continue

        # Read the page and release the handle before the heavy lifting.
        with open(embed_path, "r", encoding="utf-8", errors="replace") as embed:
            contents = embed.read()

        # No extra argument here: the second positional parameter of a
        # compiled pattern's findall() is the start offset, not a flags value.
        regex_results = regex_pattern.findall(contents)
        if len(regex_results) < 1:
            print(f"No match for {embed_path}: {len(regex_results)}")
            continue

        json_result_dict = {}
        try:
            json_data = json.loads(regex_results[0])

            ######################
            if len(json_data[0]) < 2:
                print("No data: " + dir_name)
                continue
            asset_json = json_data[0][1]

            # Kept as a module-level name on purpose: decode_file_json() reads
            # the global `asset_id` for its diagnostic messages.
            asset_id = asset_json[0]
            json_result_dict["name"] = asset_json[1]
            json_result_dict["description"] = asset_json[2]
            json_result_dict["thumbnail_url"] = asset_json[3]

            if asset_json[4] is None:  # viewer data container
                print("No viewer data: ", asset_id)
                continue

            # 5 is empty
            # 6: len=8 [bool, False, bool, None, None, bool, bool, bool]
            category_id = asset_json[7]
            if category_id == 0:
                print("Uncategorized: ", dir_name)
            json_result_dict["category"] = _CATEGORY_NAMES[category_id]

            licence_id = asset_json[8]  # 0 = CC-BY-ND, 1 = CC-BY
            if licence_id != 0 and licence_id != 1:
                print("Unknown licence: ", asset_id, licence_id)
            else:
                json_result_dict["licence"] = "CREATIVE_COMMONS_BY_ND" if licence_id == 0 else "CREATIVE_COMMONS_BY"

            visibility_id = asset_json[9]  # 2 = Unlisted, 3 = Public
            if visibility_id != 3 and visibility_id != 2:
                print("Unknown visibility: ", asset_id, visibility_id)
            else:
                json_result_dict["visibility"] = "UNLISTED" if visibility_id == 2 else "PUBLIC"

            # 10 = null
            # 11 = None or [camera transform?, camera offset? (empty or 3 floats),
            #      int (mostly 0 but sometimes 45 or 80 - camera fov?),
            #      int (mostly 0 but sometimes 1 or 3)] - not exported
            # 12 = some int data, meaning unknown
            json_result_dict["authorName"] = asset_json[13]

            # Field 14 is divided by 1000 twice to get seconds for fromtimestamp().
            created_timestamp = float(asset_json[14]) / 1000 / 1000
            dt = datetime.datetime.fromtimestamp(created_timestamp, tz=datetime.timezone.utc)
            created_timestamp = dt.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
            json_result_dict["createTime"] = created_timestamp
            # No separate update time is read; the creation time is reused.
            json_result_dict["updateTime"] = created_timestamp

            json_result_dict["authorId"] = asset_json[15]  # Maybe?
            # 16 = bool
            # 17 = null
            # 18 = thumbnail url ?
            json_result_dict["likes"] = asset_json[19]
            # 20 = null
            # 21 = background color, processed after the formats
            # 22 = int
            # 23 = null or []
            json_result_dict["tags"] = asset_json[24]
            # 25 = some id, 26 = some int array - meaning unknown
            json_result_dict["views"] = asset_json[28]  # ?
            # 27, 29..34 not used

            formats_list = []
            color_space = "LINEAR"
            for file_json in asset_json[35]:
                files_dict = decode_file_json(file_json)
                resources = [decode_subfile_json(subfile) for subfile in file_json[3]]
                if len(resources) > 0:
                    files_dict["resources"] = resources
                formats_list.append(files_dict)
                if files_dict["formatType"] == "TILT":
                    color_space = "GAMMA"
            json_result_dict["formats"] = formats_list

            # If several formats carry the "Updated glTF" role the last one wins.
            gltf_path = None
            for format_dict in formats_list:
                if format_dict["root"]["role"].startswith("Updated glTF"):
                    gltf_path = os.path.join(GLTF_DIR, dir_name, format_dict["root"]["url"].split("/")[-1])

            if gltf_path is not None and os.path.exists(gltf_path):
                with open(gltf_path, "r", encoding="utf-8", errors="replace") as gltf:
                    gltf_contents = gltf.read()
                gltf_data = json.loads(gltf_contents)

                _extras = gltf_data.get("extras", {})
                # motionPath: FULL_ROTATION, SIDE_TO_SIDE, NONE
                json_result_dict["GOOGLE_initial_camera_motion"] = _extras.get("GOOGLE_initial_camera_motion", None)
                # disableShadows: true, false
                json_result_dict["GOOGLE_lighting_rig"] = _extras.get("GOOGLE_lighting_rig", None)

                _extensions = gltf_data["scenes"][0].get("extensions", {})
                # backgroundImage: {index: int}
                # color: [float, float, float]
                json_result_dict["GOOGLE_backgrounds"] = _extensions.get("GOOGLE_backgrounds", None)
                # scaling_factor: float
                json_result_dict["GOOGLE_real_world_transform"] = _extensions.get("GOOGLE_real_world_transform", None)
                # rotation: [float, float, float, float]
                json_result_dict["GOOGLE_scene_rotation"] = _extensions.get("GOOGLE_scene_rotation", None)
                # light: int
                json_result_dict["GOOGLE_lights_image_based"] = _extensions.get("GOOGLE_lights_image_based", None)

                _scene_extras = gltf_data["scenes"][0].get("extras", {})

                cameras = gltf_data.get("cameras", [])
                if len(cameras) > 0:
                    # The transform of the first node that references a camera
                    # is copied onto the first camera entry.
                    camera_node = None
                    for node in gltf_data["nodes"]:
                        if node.get("camera", None) is not None:
                            camera_node = node
                            break
                    if camera_node is not None:
                        if "matrix" in camera_node:
                            tr = decompose_matrix(camera_node["matrix"])
                            cameras[0]["translation"] = tr["translation"]
                            cameras[0]["rotation"] = tr["rotation"]
                        elif "translation" in camera_node:
                            cameras[0]["translation"] = camera_node["translation"]
                            if "rotation" in camera_node:
                                cameras[0]["rotation"] = camera_node["rotation"]
                            else:
                                print("Translation without rotation", dir_name)
                        else:
                            print("Unknown camera transform type", dir_name)

                # NOTE(review): the scene extension GOOGLE_camera_settings
                # (panLimits, yawLimits, pitchLimits, distanceLimits,
                # mode: orbit/moveableOrbit, pivot) is not exported; only the
                # raw `cameras` list is written under "camera".  Confirm
                # whether the settings should be added to the record.
                json_result_dict["camera"] = cameras

                # groundColor: [float, float, float]
                json_result_dict["GOOGLE_hemi_light"] = _scene_extras.get("GOOGLE_hemi_light", None)
                # visualCenterPoint: 3 floats
                # stats: radius (float), stdev (float), centroid (3 floats)
                json_result_dict["GOOGLE_geometry_data"] = _scene_extras.get("GOOGLE_geometry_data", None)

            if asset_json[21] is not None and len(asset_json[21]) == 1:
                json_result_dict["presentationParams"] = {
                    "orientingRotation": {"w": 1},
                    "colorSpace": color_space,
                    "backgroundColor": asset_json[21][0]
                }
            # 36 = int array, 37 = int array, 38 = textures?

            ######################
            # json_data[0][2] is just the upload url, [3]..[5] are bools,
            # [6] has 12 elements ([6][0] is probably the asset id again,
            # [6][1] the title, [6][2] probably the same as the viewer data).

            # Serialise first so a failure cannot leave a dangling separator.
            combined_entry = json.dumps({f"{asset_id}": json_result_dict}, indent=4)
            if wrote_entry:
                all_json_outfile.write(",\n")
            all_json_outfile.write(combined_entry)
            wrote_entry = True

            # The per-asset and JSONL records additionally carry the id inline.
            json_result_dict["assetId"] = asset_id
            json.dump(json_result_dict, all_jsonl_outfile)
            all_jsonl_outfile.write("\n")

            os.makedirs(os.path.dirname(json_file_path), exist_ok=True)
            with open(json_file_path, "w", encoding="utf-8") as json_outfile:
                json.dump(json_result_dict, json_outfile, indent=4)

        except json.decoder.JSONDecodeError:
            print(f"Error parsing json: {embed_path}")
            continue
        except Exception as e:
            # Batch boundary: report the asset and carry on with the next one.
            print(f"Unknown error for {embed_path}: {e}")
            continue

    all_json_outfile.write("]")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment