# andybak/scan_embed_to_json.py

## scan_embed_to_json.py
import datetime
import json
import math
import os
import re

# Root of the scraped "view" pages. The driver loop below expects one
# sub-directory per asset, each holding an ``embed.html``.
EMBED_DIR = "c:\\poly_html\\poly.google.com\\view"
# Output root: receives ``all_data.json`` / ``all_data.jsonl`` and one
# ``<asset dir>/data.json`` per processed asset.
JSON_DIR = "c:\\poly_megajson\\poly.google.com\\view"
# Root of the per-asset glTF files that are read for viewer/camera metadata.
GLTF_DIR = "C:\\poly_updated_gltf\\poly.google.com\\view"

# Captures the ``data:`` payload of an ``AF_initDataCallback({key: 'ds:...``
# call, i.e. everything between ``data:`` and ``, sideChannel:``. ``[\s\S]``
# is used instead of ``.`` so the lazy matches can span line breaks.
regex_pattern = re.compile(
    # r'<meta property="og:asset".+>'
    r"AF_initDataCallback\(\{key: 'ds:[\s\S]*?data:([\s\S]*?), sideChannel:"
)


def decompose_matrix(matrix):
    """Split a glTF node matrix into rotation, translation and scale.

    glTF stores ``node.matrix`` as 16 floats in column-major order, so the
    element at (row r, column c) lives at ``matrix[4 * c + r]``: the three
    basis vectors are ``matrix[0:3]``, ``matrix[4:7]`` and ``matrix[8:11]``
    and the translation is ``matrix[12:15]``.

    Args:
        matrix: Sequence of 16 numbers (column-major 4x4 transform).

    Returns:
        dict with ``rotation`` as a quaternion ``[x, y, z, w]``,
        ``translation`` as ``[tx, ty, tz]`` and ``scale`` as
        ``[sx, sy, sz]`` (the lengths of the basis vectors).

    Note:
        Mirrored transforms (negative determinant) are not special-cased;
        the scale components are always non-negative.
    """
    # Basis vectors = columns of the upper-left 3x3 block.
    axes = [list(matrix[4 * col:4 * col + 3]) for col in range(3)]
    scale = [math.sqrt(sum(v * v for v in axis)) for axis in axes]

    # Strip the scale so that only the pure rotation remains. A zero-length
    # axis is left untouched to avoid dividing by zero.
    for axis, length in zip(axes, scale):
        if length != 0:
            axis[:] = [v / length for v in axis]

    # r<row><col> of the rotation matrix; each axis is one column.
    (r00, r10, r20), (r01, r11, r21), (r02, r12, r22) = axes

    trace = r00 + r11 + r22

    # Standard rotation-matrix -> quaternion conversion; the branch is chosen
    # by the largest diagonal term to keep the divisor well away from zero.
    if trace > 0:
        s = 0.5 / math.sqrt(trace + 1.0)
        w = 0.25 / s
        x = (r21 - r12) * s
        y = (r02 - r20) * s
        z = (r10 - r01) * s
    elif r00 > r11 and r00 > r22:
        s = 2.0 * math.sqrt(1.0 + r00 - r11 - r22)
        w = (r21 - r12) / s
        x = 0.25 * s
        y = (r01 + r10) / s
        z = (r02 + r20) / s
    elif r11 > r22:
        s = 2.0 * math.sqrt(1.0 + r11 - r00 - r22)
        w = (r02 - r20) / s
        x = (r01 + r10) / s
        y = 0.25 * s
        z = (r12 + r21) / s
    else:
        s = 2.0 * math.sqrt(1.0 + r22 - r00 - r11)
        w = (r10 - r01) / s
        x = (r02 + r20) / s
        y = (r12 + r21) / s
        z = 0.25 * s

    return {
        'rotation': [x, y, z, w],
        # Column-major: the translation is the first three entries of the
        # fourth column.
        'translation': list(matrix[12:15]),
        'scale': scale,
    }


def decode_file_json(file_json):
    """Decode one entry of an asset's file list into a format dictionary.

    Indices of ``file_json`` read here:
        0: file id
        1: role id (see ``role_names``)
        2: root file ``[_, file name, file type id, url]``
        4: int array whose first element is exported as the triangle count
        5: optional archive ``[id, file name, type id, url]``; only exported
           when its type id is 3001 (zip)

    Args:
        file_json: One decoded JSON list from the asset's files array.

    Returns:
        dict with ``root`` (id, url, role), ``formatComplexity``,
        ``formatType`` and, when a zip archive is attached, ``archive``.

    Raises:
        AssertionError: if the role id is unknown, or if the file extension
            does not match the one expected for a known file type id.
    """
    # Observed role combinations (count/number of files per asset):
    # 1747/4: Original glTF, Original OBJ, Updated glTF, USDZ
    # 618/6: Original FBX, Original glTF, Original OBJ, Updated glTF, Original Tri OBJ, USDZ
    # 303/4: Original FBX, Original glTF, Updated glTF, USDZ
    # 160/3: Original glTF, Updated glTF, USDZ
    # 158/3: Original OBJ, Updated glTF, USDZ
    # 149/4: GLB, Original glTF, Updated glTF, USDZ
    # 116/3: Original glTF, Original OBJ, Updated glTF
    # 50/3: Original FBX, Original glTF, Updated glTF
    # 41/5: Original FBX, Original glTF, Original OBJ, Updated glTF, USDZ
    # 33/3: GLB, Original glTF, Updated glTF
    # 23/5: Original FBX, Original OBJ, Updated glTF, Original Tri OBJ, USDZ
    # 20/3: Original FBX, Updated glTF, USDZ
    # 18/2: Original FBX, Original glTF
    # 16/5: Original FBX, Original glTF, Original OBJ, Updated glTF, Original Tri OBJ
    # 14/2: Original glTF, Updated glTF
    # 12/5: Original FBX, Original glTF, Updated glTF, Original Tri OBJ, USDZ
    # 7/3: GLB, Updated glTF, USDZ
    # 6/2: Updated glTF, USDZ
    # 3/2: GLB, Original glTF
    # 2/1: Original glTF
    # 1/4: Original FBX, Original OBJ, Updated glTF, USDZ
    # 1/3: Original FBX, Original glTF, USDZ
    # 1/2: Original glTF, Original OBJ
    # 1/5: Original glTF, Original OBJ, Updated glTF, Original Tri OBJ, USDZ
    #
    # Observed role totals:
    # Updated glTF File: 2722, Original glTF File: 2572, USDZ File: 2542,
    # Original OBJ File: 2156, Original FBX File: 846,
    # Original Triangulated OBJ File: 517, GLB File: 145
    role_names = {
        1: "Original OBJ File",
        2: "Tilt File",
        4: "Unknown GLTF File",
        6: "Original FBX File",
        7: "Blocks File",
        8: "USD File",
        11: "HTML File",
        12: "Original glTF File",
        13: "Tour Creator Experience",
        15: "JSON File",
        16: "lullmodel File",
        17: "sand File",
        18: "GLB File",
        19: "sand File",
        20: "sandc File",
        21: "pb File",
        22: "Unknown GLTF File",
        24: "Original Triangulated OBJ File",
        25: "JPG (Buggy)",
        26: "USDZ File",
        30: "Updated glTF File",
        32: "Editor settings pb file",
        35: "Unknown GLTF File",
        36: "Unknown GLB File",
        38: "Unknown GLB File",
    }

    # file type id -> (exported format type, accepted lower-case extensions)
    format_types = {
        1: ("OBJ", ("obj",)),
        2: ("TILT", ("tilt",)),
        4: ("GLTF", ("gltf",)),
        5: ("GLB", ("glb",)),
        6: ("FBX", ("fbx",)),
        7: ("BLOCKS", ("blocks", "poly")),
        8: ("USD", ("usd",)),
        11: ("HTML", ("html",)),
        13: ("TOUR", ("experience",)),
        15: ("LULL", ("lullmodel",)),
        16: ("SAND", ("sand",)),
        17: ("SANDC", ("sandc",)),
        19: ("USDZ", ("usdz",)),
        1000: ("JPG", ("jpg",)),
        3001: ("ZIP", ("zip",)),
        3002: ("JSON", ("json",)),
        3012: ("PB", ("pb",)),
        3016: ("PB", ("pb",)),
    }

    file_id = file_json[0]
    role_id = file_json[1]
    root_entry = file_json[2]
    file_name = root_entry[1]
    file_type_id = root_entry[2]
    file_url = root_entry[3]
    extension = file_name.rsplit(".", 1)[-1].lower()

    # ``asset_id`` is a module-level variable set by the driver loop. Look it
    # up defensively so a diagnostic message can never raise NameError when
    # this function is used on its own.
    current_asset = globals().get("asset_id", "<unknown asset>")

    main_role = role_names.get(role_id)
    if main_role is None:
        message = f"Unknown main role in {current_asset}: {role_id} for {file_name.split('.')[-1]}"
        print(message)
        # Raised explicitly (not via ``assert``) so the check survives -O.
        raise AssertionError(message)

    known_format = format_types.get(file_type_id)
    if known_format is None:
        format_type = "UNKNOWN"
        print(f"Unknown file type in {current_asset}: {file_type_id} for {file_name}")
    else:
        format_type, accepted_extensions = known_format
        if extension not in accepted_extensions:
            raise AssertionError(
                f"Extension {extension!r} does not match file type "
                f"{file_type_id} ({format_type}) for {file_name}"
            )

    file_dict = {
        "root": {"id": file_id, "url": file_url, "role": main_role},
        "formatComplexity": {
            "triangleCount": file_json[4][0],
        },
        "formatType": format_type,
    }

    # Index 5, when present, is an archive of the format; only zip archives
    # (type id 3001) are exported.
    if len(file_json) > 5 and file_json[5][2] == 3001:
        archive = file_json[5]
        file_dict["archive"] = {
            "id": archive[0],
            "filename": archive[1],
            "url": archive[3],
        }

    return file_dict


def decode_subfile_json(subfile_json):
    """Reduce one resource (sub-file) entry to its id, filename and url.

    Reads indices 0, 1 and 3 of ``subfile_json``; index 2 is not used.
    """
    resource_id, resource_name = subfile_json[0], subfile_json[1]
    resource_url = subfile_json[3]
    return dict(id=resource_id, filename=resource_name, url=resource_url)


# Driver: walk every asset directory under EMBED_DIR, pull the embedded data
# blob out of embed.html, convert it to a record and write it to
#   * all_data.json   - one JSON array of {asset_id: record} objects
#   * all_data.jsonl  - one record per line (with "assetId" added)
#   * <JSON_DIR>/<asset dir>/data.json - the same record, pretty-printed
all_jsonl_path = os.path.join(JSON_DIR, "all_data.jsonl")
all_json_path = os.path.join(JSON_DIR, "all_data.json")
os.makedirs(os.path.dirname(all_json_path), exist_ok=True)

stats_set = set()
i = 0

# Built once instead of once per asset.
category_names = {
    0: "Uncategorized",
    1: "Art",
    2: "Animals & Pets",
    3: "Architecture",
    4: "Places & Scenes",
    5: "(Unused)",
    6: "Food & Drink",
    7: "Nature",
    8: "People & Characters",
    9: "Tools & Technology",
    10: "Transport",
    11: "Miscellaneous",
    12: "Objects",
    13: "Culture & Humanity",
    14: "Current Events",
    15: "Furniture & Home",
    16: "History",
    17: "Science",
    18: "Sports & Fitness",
    19: "Travel & Leisure",
}

with open(all_json_path, "w", encoding="utf-8") as all_json_outfile, open(all_jsonl_path, "w", encoding="utf-8") as all_jsonl_outfile:

    all_json_outfile.write("[")
    # Separators are written *before* every entry but the first so the array
    # never ends in a trailing comma (which would make it invalid JSON).
    wrote_entry = False

    for dir_name in os.listdir(EMBED_DIR):

        embed_dir = os.path.join(EMBED_DIR, dir_name)
        if not os.path.isdir(embed_dir):
            continue

        embed_path = os.path.join(embed_dir, "embed.html")
        json_file_path = os.path.join(JSON_DIR, dir_name, "data.json")
        if not os.path.exists(embed_path):
            print(f"No embed found for {embed_path}")
            continue

        with open(embed_path, "r", encoding="utf-8", errors="replace") as embed:
            contents = embed.read()
        # NOTE: a compiled pattern's findall() takes (string, pos, endpos);
        # passing a flag there would silently skip the start of the file.
        regex_results = regex_pattern.findall(contents)

        if len(regex_results) < 1:
            print(f"No match for {embed_path}: {len(regex_results)}")
            continue

        json_result_dict = {}
        try:
            json_data = json.loads(regex_results[0])

            if len(json_data[0]) < 2:
                print("No data: " + dir_name)
                continue

            # All per-asset fields live in json_data[0][1]. Indices that are
            # not read below are unidentified (5 is empty, 6 is a list of
            # bools, 10/17/20/23 are null, 12/22/26 are ints or int arrays).
            asset = json_data[0][1]

            # Kept as a module-level name: decode_file_json reads it for its
            # diagnostic messages.
            asset_id = asset[0]

            json_result_dict["name"] = asset[1]
            json_result_dict["description"] = asset[2]
            json_result_dict["thumbnail_url"] = asset[3]

            if asset[4] is None:
                print("No viewer data: ", asset_id)
                continue

            category_id = asset[7]
            if category_id == 0:
                print("Uncategorized: ", dir_name)
            json_result_dict["category"] = category_names[category_id]

            licence_id = asset[8]  # 0 = CC-BY-ND, 1 = CC-BY
            if licence_id != 0 and licence_id != 1:
                print("Unknown licence: ", asset_id, licence_id)
            else:
                json_result_dict["licence"] = "CREATIVE_COMMONS_BY_ND" if licence_id == 0 else "CREATIVE_COMMONS_BY"

            visibility_id = asset[9]  # 2 = Unlisted, 3 = Public
            if visibility_id != 3 and visibility_id != 2:
                print("Unknown visibility: ", asset_id, asset[9])
            else:
                json_result_dict["visibility"] = "UNLISTED" if visibility_id == 2 else "PUBLIC"

            # asset[11], when not None, holds what may be a camera transform
            # ([0]) and a camera offset ([1]); neither is exported.

            json_result_dict["authorName"] = asset[13]

            # Index 14 is the creation time in microseconds since the epoch.
            created_timestamp = float(asset[14]) / 1000 / 1000
            dt = datetime.datetime.fromtimestamp(created_timestamp, tz=datetime.timezone.utc)
            created_timestamp = dt.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
            json_result_dict["createTime"] = created_timestamp
            json_result_dict["updateTime"] = created_timestamp

            json_result_dict["authorId"] = asset[15]  # Maybe?
            json_result_dict["likes"] = asset[19]
            json_result_dict["tags"] = asset[24]
            json_result_dict["views"] = asset[28]  # ?

            formats_list = []
            color_space = "LINEAR"
            for file_json in asset[35]:
                files_dict = decode_file_json(file_json)
                resources = [decode_subfile_json(subfile) for subfile in file_json[3]]
                if resources:
                    files_dict["resources"] = resources
                formats_list.append(files_dict)
                if files_dict["formatType"] == "TILT":
                    color_space = "GAMMA"

            json_result_dict["formats"] = formats_list

            # The "Updated glTF" file (last one wins) carries viewer metadata.
            gltf_path = None
            for format_dict in formats_list:
                if format_dict["root"]["role"].startswith("Updated glTF"):
                    gltf_path = os.path.join(GLTF_DIR, dir_name, format_dict["root"]["url"].split("/")[-1])

            if gltf_path is not None and os.path.exists(gltf_path):
                with open(gltf_path, "r", encoding="utf-8", errors="replace") as gltf:
                    gltf_data = json.load(gltf)

                _extras = gltf_data.get("extras", {})

                # motionPath:  FULL_ROTATION, SIDE_TO_SIDE, NONE
                json_result_dict["GOOGLE_initial_camera_motion"] = _extras.get("GOOGLE_initial_camera_motion", None)

                # disableShadows: true, false
                json_result_dict["GOOGLE_lighting_rig"] = _extras.get("GOOGLE_lighting_rig", None)

                _extensions = gltf_data["scenes"][0].get("extensions", {})

                # backgroundImage: {index: int}
                # color: [float, float, float]
                json_result_dict["GOOGLE_backgrounds"] = _extensions.get("GOOGLE_backgrounds", None)

                # scaling_factor: float
                json_result_dict["GOOGLE_real_world_transform"] = _extensions.get("GOOGLE_real_world_transform", None)

                # rotation: [float, float, float, float]
                json_result_dict["GOOGLE_scene_rotation"] = _extensions.get("GOOGLE_scene_rotation", None)

                # light: int
                json_result_dict["GOOGLE_lights_image_based"] = _extensions.get("GOOGLE_lights_image_based", None)

                _scene_extras = gltf_data["scenes"][0].get("extras", {})

                cameras = gltf_data.get("cameras", [])
                if len(cameras) > 0:
                    # Copy the transform of the first node that references a
                    # camera onto the first camera definition.
                    camera_node = None
                    for node in gltf_data["nodes"]:
                        if node.get("camera", None) is not None:
                            camera_node = node
                            break
                    if camera_node is not None:
                        if "matrix" in camera_node:
                            tr = decompose_matrix(camera_node["matrix"])
                            cameras[0]["translation"] = tr["translation"]
                            cameras[0]["rotation"] = tr["rotation"]
                        elif "translation" in camera_node:
                            cameras[0]["translation"] = camera_node["translation"]
                            if "rotation" in camera_node:
                                cameras[0]["rotation"] = camera_node["rotation"]
                            else:
                                print("Translation without rotation", dir_name)
                        else:
                            print("Unknown camera transform type", dir_name)

                # NOTE(review): the scene extension "GOOGLE_camera_settings"
                # (panLimits, yawLimits, pitchLimits, distanceLimits, mode,
                # pivot) used to be merged into a throw-away dict that was
                # never written out. The exported value has always been the
                # raw cameras list; confirm whether the settings should be
                # exported before changing the output schema.
                json_result_dict["camera"] = cameras

                # groundColor: [float, float, float]
                json_result_dict["GOOGLE_hemi_light"] = _scene_extras.get("GOOGLE_hemi_light", None)

                # visualCenterPoint: 3 floats
                # stats: {radius: float, stdev: float, centroid: 3 floats}
                json_result_dict["GOOGLE_geometry_data"] = _scene_extras.get("GOOGLE_geometry_data", None)

            # Index 21 holds the background colour when it has one element.
            if asset[21] is not None and len(asset[21]) == 1:
                json_result_dict["presentationParams"] = {
                    "orientingRotation": {"w": 1},
                    "colorSpace": color_space,
                    "backgroundColor": asset[21][0]
                }

            # 36 = int array, 37 = int array, 38 = textures?
            # json_data[0][2] is just the upload url, [3]-[5] are bools,
            # [6] has 12 elements (probably asset id, title, viewer data).

            if wrote_entry:
                all_json_outfile.write(",\n")
            json.dump({f"{asset_id}": json_result_dict}, all_json_outfile, indent=4)
            wrote_entry = True

            # "assetId" is added only after the aggregate dump, where the id
            # is already the key of the wrapping object.
            json_result_dict["assetId"] = asset_id
            json.dump(json_result_dict, all_jsonl_outfile)
            all_jsonl_outfile.write("\n")

            os.makedirs(os.path.dirname(json_file_path), exist_ok=True)
            with open(json_file_path, "w", encoding="utf-8") as json_outfile:
                json.dump(json_result_dict, json_outfile, indent=4)

        except json.decoder.JSONDecodeError:
            print(f"Error parsing json: {embed_path}")
            continue

        except Exception as e:
            # One malformed asset must not abort the whole run.
            print(f"Unknown error for {embed_path}: {e}")
            continue

    all_json_outfile.write("]")

                # if len(regex_results) != 1:
                #     print(len(regex_results), path_name)
	import datetime
	import json
	import math
	import os
	import re

	# Input root (one sub-directory per asset), output root, and the root of
	# the per-asset glTF files.
	EMBED_DIR = "c:\\poly_html\\poly.google.com\\view"
	JSON_DIR = "c:\\poly_megajson\\poly.google.com\\view"
	GLTF_DIR = "C:\\poly_updated_gltf\\poly.google.com\\view"

	# Captures everything between ``data:`` and ``, sideChannel:`` of an
	# ``AF_initDataCallback({key: 'ds:...`` call. The lazy ``*?`` quantifiers
	# are required: with a bare ``?`` at most one character may separate the
	# anchors and the pattern can never match a real payload.
	regex_pattern = re.compile(
		# r'<meta property="og:asset".+>'
		r"AF_initDataCallback\(\{key: 'ds:[\s\S]*?data:([\s\S]*?), sideChannel:"
	)


	def decompose_matrix(matrix):
		"""Split a glTF node matrix into rotation, translation and scale.

		glTF stores ``node.matrix`` as 16 floats in column-major order, so the
		element at (row r, column c) lives at ``matrix[4 * c + r]``: the three
		basis vectors are ``matrix[0:3]``, ``matrix[4:7]`` and ``matrix[8:11]``
		and the translation is ``matrix[12:15]``.

		Args:
			matrix: Sequence of 16 numbers (column-major 4x4 transform).

		Returns:
			dict with ``rotation`` as a quaternion ``[x, y, z, w]``,
			``translation`` as ``[tx, ty, tz]`` and ``scale`` as
			``[sx, sy, sz]`` (the lengths of the basis vectors).
		"""
		# Basis vectors = columns of the upper-left 3x3 block.
		axes = [list(matrix[4 * col:4 * col + 3]) for col in range(3)]
		scale = [math.sqrt(sum(v ** 2 for v in axis)) for axis in axes]

		# Strip the scale so that only the pure rotation remains. A
		# zero-length axis is left untouched to avoid dividing by zero.
		for axis, length in zip(axes, scale):
			if length != 0:
				axis[:] = [v / length for v in axis]

		# r<row><col> of the rotation matrix; each axis is one column.
		(r00, r10, r20), (r01, r11, r21), (r02, r12, r22) = axes

		trace = r00 + r11 + r22

		# Standard rotation-matrix -> quaternion conversion; the branch is
		# chosen by the largest diagonal term.
		if trace > 0:
			s = 0.5 / math.sqrt(trace + 1.0)
			w = 0.25 / s
			x = (r21 - r12) * s
			y = (r02 - r20) * s
			z = (r10 - r01) * s
		elif r00 > r11 and r00 > r22:
			s = 2.0 * math.sqrt(1.0 + r00 - r11 - r22)
			w = (r21 - r12) / s
			x = 0.25 * s
			y = (r01 + r10) / s
			z = (r02 + r20) / s
		elif r11 > r22:
			s = 2.0 * math.sqrt(1.0 + r11 - r00 - r22)
			w = (r02 - r20) / s
			x = (r01 + r10) / s
			y = 0.25 * s
			z = (r12 + r21) / s
		else:
			s = 2.0 * math.sqrt(1.0 + r22 - r00 - r11)
			w = (r10 - r01) / s
			x = (r02 + r20) / s
			y = (r12 + r21) / s
			z = 0.25 * s

		return {
			'rotation': [x, y, z, w],
			# Column-major: translation is the start of the fourth column.
			'translation': list(matrix[12:15]),
			'scale': scale,
		}


	def decode_file_json(file_json):
		"""Decode one entry of an asset's file list into a format dictionary.

		Indices of ``file_json`` read here:
			0: file id
			1: role id (see ``role_names``)
			2: root file ``[_, file name, file type id, url]``
			4: int array whose first element is exported as the triangle count
			5: optional archive ``[id, file name, type id, url]``; only
			   exported when its type id is 3001 (zip)

		Returns:
			dict with ``root`` (id, url, role), ``formatComplexity``,
			``formatType`` and, when a zip archive is attached, ``archive``.

		Raises:
			AssertionError: if the role id is unknown, or if the file
				extension does not match the one expected for a known file
				type id.
		"""
		# Observed role combinations (count/number of files per asset):
		# 1747/4: Original glTF, Original OBJ, Updated glTF, USDZ
		# 618/6: Original FBX, Original glTF, Original OBJ, Updated glTF, Original Tri OBJ, USDZ
		# 303/4: Original FBX, Original glTF, Updated glTF, USDZ
		# 160/3: Original glTF, Updated glTF, USDZ
		# 158/3: Original OBJ, Updated glTF, USDZ
		# 149/4: GLB, Original glTF, Updated glTF, USDZ
		# 116/3: Original glTF, Original OBJ, Updated glTF
		# 50/3: Original FBX, Original glTF, Updated glTF
		# 41/5: Original FBX, Original glTF, Original OBJ, Updated glTF, USDZ
		# 33/3: GLB, Original glTF, Updated glTF
		# 23/5: Original FBX, Original OBJ, Updated glTF, Original Tri OBJ, USDZ
		# 20/3: Original FBX, Updated glTF, USDZ
		# 18/2: Original FBX, Original glTF
		# 16/5: Original FBX, Original glTF, Original OBJ, Updated glTF, Original Tri OBJ
		# 14/2: Original glTF, Updated glTF
		# 12/5: Original FBX, Original glTF, Updated glTF, Original Tri OBJ, USDZ
		# 7/3: GLB, Updated glTF, USDZ
		# 6/2: Updated glTF, USDZ
		# 3/2: GLB, Original glTF
		# 2/1: Original glTF
		# 1/4: Original FBX, Original OBJ, Updated glTF, USDZ
		# 1/3: Original FBX, Original glTF, USDZ
		# 1/2: Original glTF, Original OBJ
		# 1/5: Original glTF, Original OBJ, Updated glTF, Original Tri OBJ, USDZ
		#
		# Observed role totals:
		# Updated glTF File: 2722, Original glTF File: 2572, USDZ File: 2542,
		# Original OBJ File: 2156, Original FBX File: 846,
		# Original Triangulated OBJ File: 517, GLB File: 145
		role_names = {
			1: "Original OBJ File",
			2: "Tilt File",
			4: "Unknown GLTF File",
			6: "Original FBX File",
			7: "Blocks File",
			8: "USD File",
			11: "HTML File",
			12: "Original glTF File",
			13: "Tour Creator Experience",
			15: "JSON File",
			16: "lullmodel File",
			17: "sand File",
			18: "GLB File",
			19: "sand File",
			20: "sandc File",
			21: "pb File",
			22: "Unknown GLTF File",
			24: "Original Triangulated OBJ File",
			25: "JPG (Buggy)",
			26: "USDZ File",
			30: "Updated glTF File",
			32: "Editor settings pb file",
			35: "Unknown GLTF File",
			36: "Unknown GLB File",
			38: "Unknown GLB File",
		}

		# file type id -> (exported format type, accepted lower-case extensions)
		format_types = {
			1: ("OBJ", ("obj",)),
			2: ("TILT", ("tilt",)),
			4: ("GLTF", ("gltf",)),
			5: ("GLB", ("glb",)),
			6: ("FBX", ("fbx",)),
			7: ("BLOCKS", ("blocks", "poly")),
			8: ("USD", ("usd",)),
			11: ("HTML", ("html",)),
			13: ("TOUR", ("experience",)),
			15: ("LULL", ("lullmodel",)),
			16: ("SAND", ("sand",)),
			17: ("SANDC", ("sandc",)),
			19: ("USDZ", ("usdz",)),
			1000: ("JPG", ("jpg",)),
			3001: ("ZIP", ("zip",)),
			3002: ("JSON", ("json",)),
			3012: ("PB", ("pb",)),
			3016: ("PB", ("pb",)),
		}

		file_id = file_json[0]
		role_id = file_json[1]
		root_entry = file_json[2]
		file_name = root_entry[1]
		file_type_id = root_entry[2]
		file_url = root_entry[3]
		extension = file_name.rsplit(".", 1)[-1].lower()

		# ``asset_id`` is a module-level variable set by the driver loop.
		# Look it up defensively so a diagnostic message can never raise
		# NameError when this function is used on its own.
		current_asset = globals().get("asset_id", "<unknown asset>")

		main_role = role_names.get(role_id)
		if main_role is None:
			message = f"Unknown main role in {current_asset}: {role_id} for {file_name.split('.')[-1]}"
			print(message)
			# Raised explicitly (not via ``assert``) so the check survives -O.
			raise AssertionError(message)

		known_format = format_types.get(file_type_id)
		if known_format is None:
			format_type = "UNKNOWN"
			print(f"Unknown file type in {current_asset}: {file_type_id} for {file_name}")
		else:
			format_type, accepted_extensions = known_format
			if extension not in accepted_extensions:
				raise AssertionError(
					f"Extension {extension!r} does not match file type "
					f"{file_type_id} ({format_type}) for {file_name}"
				)

		file_dict = {
			"root": {"id": file_id, "url": file_url, "role": main_role},
			"formatComplexity": {
				"triangleCount": file_json[4][0],
			},
			"formatType": format_type,
		}

		# Index 5, when present, is an archive of the format; only zip
		# archives (type id 3001) are exported.
		if len(file_json) > 5 and file_json[5][2] == 3001:
			archive = file_json[5]
			file_dict["archive"] = {
				"id": archive[0],
				"filename": archive[1],
				"url": archive[3],
			}

		return file_dict


	def decode_subfile_json(subfile_json):
		"""Reduce one resource (sub-file) entry to its id, filename and url.

		Reads indices 0, 1 and 3 of ``subfile_json``; index 2 is not used.
		"""
		return {
			"id": subfile_json[0],
			"filename": subfile_json[1],
			"url": subfile_json[3],
		}


	# Batch-convert every scraped embed.html under EMBED_DIR into JSON metadata.
	#
	# For each asset directory that yields a record this writes:
	#   * one {asset_id: record} object into the JSON array in all_data.json,
	#   * one line (record plus "assetId") into all_data.jsonl,
	#   * JSON_DIR/<dir_name>/data.json with the same content as the .jsonl line.
	all_jsonl_path = os.path.join(JSON_DIR, "all_data.jsonl")
	all_json_path = os.path.join(JSON_DIR, "all_data.json")
	os.makedirs(os.path.dirname(all_json_path), exist_ok=True)

	# Not read anywhere in the visible part of the script; kept in case later
	# code still refers to them.
	stats_set = set()
	i = 0

	# Category id (field 7 of the asset array) -> display name. Built once rather
	# than once per asset.
	CATEGORY_NAMES = {
	    0: "Uncategorized",
	    1: "Art",
	    2: "Animals & Pets",
	    3: "Architecture",
	    4: "Places & Scenes",
	    5: "(Unused)",
	    6: "Food & Drink",
	    7: "Nature",
	    8: "People & Characters",
	    9: "Tools & Technology",
	    10: "Transport",
	    11: "Miscellaneous",
	    12: "Objects",
	    13: "Culture & Humanity",
	    14: "Current Events",
	    15: "Furniture & Home",
	    16: "History",
	    17: "Science",
	    18: "Sports & Fitness",
	    19: "Travel & Leisure",
	}

	with open(all_json_path, "w", encoding="utf-8") as all_json_outfile, \
	        open(all_jsonl_path, "w", encoding="utf-8") as all_jsonl_outfile:

	    all_json_outfile.write("[")
	    # Separators go *between* entries so the closing "]" is never preceded by
	    # a trailing comma (which would make all_data.json invalid JSON).
	    wrote_entry = False

	    for dir_name in os.listdir(EMBED_DIR):
	        embed_dir = os.path.join(EMBED_DIR, dir_name)
	        if not os.path.isdir(embed_dir):
	            continue

	        gltf_dir = os.path.join(GLTF_DIR, dir_name)
	        embed_path = os.path.join(embed_dir, "embed.html")
	        json_file_path = os.path.join(JSON_DIR, dir_name, "data.json")

	        if not os.path.exists(embed_path):
	            print(f"No embed found for {embed_path}")
	            continue

	        with open(embed_path, "r", encoding="utf-8", errors="replace") as embed:
	            contents = embed.read()

	        # A compiled pattern's findall() takes (string, pos, endpos); passing
	        # re.MULTILINE here would be interpreted as a start offset, not a flag.
	        regex_results = regex_pattern.findall(contents)
	        if len(regex_results) < 1:
	            print(f"No match for {embed_path}: {len(regex_results)}")
	            continue

	        json_result_dict = {}
	        try:
	            json_data = json.loads(regex_results[0])

	            if len(json_data[0]) < 2:
	                print("No data: " + dir_name)
	                continue

	            # All per-asset metadata lives in this positional array; the
	            # numeric comments below refer to indices into it.
	            asset_fields = json_data[0][1]
	            asset_id = asset_fields[0]

	            json_result_dict["name"] = asset_fields[1]
	            json_result_dict["description"] = asset_fields[2]
	            json_result_dict["thumbnail_url"] = asset_fields[3]

	            if asset_fields[4] is None:
	                print("No viewer data: ", asset_id)
	                continue

	            # 5 is empty
	            # 6: len=8 [bool, False, bool, None, None, bool, bool, bool]
	            category_id = asset_fields[7]
	            if category_id == 0:
	                print("Uncategorized: ", dir_name)
	            # An id missing from the table raises KeyError, which is reported
	            # by the catch-all handler below and the asset is skipped.
	            json_result_dict["category"] = CATEGORY_NAMES[category_id]

	            licence_id = asset_fields[8]  # 0 = CC-BY-ND, 1 = CC-BY
	            if licence_id not in (0, 1):
	                print("Unknown licence: ", asset_id, licence_id)
	            else:
	                json_result_dict["licence"] = (
	                    "CREATIVE_COMMONS_BY_ND" if licence_id == 0 else "CREATIVE_COMMONS_BY"
	                )

	            visibility_id = asset_fields[9]  # 2 = Unlisted, 3 = Public
	            if visibility_id not in (2, 3):
	                print("Unknown visibility: ", asset_id, visibility_id)
	            else:
	                json_result_dict["visibility"] = "UNLISTED" if visibility_id == 2 else "PUBLIC"

	            # 10 = null
	            # 11 = [camera transform?, 3 floats or empty, int (fov?), int] - not exported
	            # 12 = unknown int data
	            json_result_dict["authorName"] = asset_fields[13]

	            # Field 14 is divided by 1e6 before being read as a POSIX
	            # timestamp, i.e. it is treated as microseconds since the epoch.
	            created_seconds = float(asset_fields[14]) / 1000 / 1000
	            created_dt = datetime.datetime.fromtimestamp(created_seconds, tz=datetime.timezone.utc)
	            created_timestamp = created_dt.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
	            json_result_dict["createTime"] = created_timestamp
	            # The same value is reused for updateTime.
	            json_result_dict["updateTime"] = created_timestamp

	            json_result_dict["authorId"] = asset_fields[15]  # Maybe?
	            # 16 = bool, 17 = null, 18 = thumbnail url ?
	            json_result_dict["likes"] = asset_fields[19]
	            # 20 = null, 21 = background colour (handled after the formats), 22 = int
	            json_result_dict["tags"] = asset_fields[24]
	            # 25 = some id, 26 = some int array
	            json_result_dict["views"] = asset_fields[28]  # ?

	            # Field 35 holds one array per downloadable format.
	            formats_list = []
	            color_space = "LINEAR"
	            for file_json in asset_fields[35]:
	                files_dict = decode_file_json(file_json)
	                resources = [decode_subfile_json(subfile) for subfile in file_json[3]]
	                if resources:
	                    files_dict["resources"] = resources
	                formats_list.append(files_dict)
	                # Any TILT format switches the asset to gamma colour space.
	                if files_dict["formatType"] == "TILT":
	                    color_space = "GAMMA"

	            json_result_dict["formats"] = formats_list

	            # Locate the local copy of the "Updated glTF" root file. There is
	            # no break, so the last matching format wins.
	            gltf_path = None
	            for format_dict in formats_list:
	                if format_dict["root"]["role"].startswith("Updated glTF"):
	                    gltf_path = os.path.join(gltf_dir, format_dict["root"]["url"].split("/")[-1])

	            if gltf_path is not None and os.path.exists(gltf_path):
	                with open(gltf_path, "r", encoding="utf-8", errors="replace") as gltf:
	                    gltf_data = json.loads(gltf.read())

	                _extras = gltf_data.get("extras", {})

	                # motionPath: FULL_ROTATION, SIDE_TO_SIDE, NONE
	                json_result_dict["GOOGLE_initial_camera_motion"] = _extras.get("GOOGLE_initial_camera_motion", None)

	                # disableShadows: true, false
	                json_result_dict["GOOGLE_lighting_rig"] = _extras.get("GOOGLE_lighting_rig", None)

	                _extensions = gltf_data["scenes"][0].get("extensions", {})

	                # backgroundImage: {index: int}
	                # color: [float, float, float]
	                json_result_dict["GOOGLE_backgrounds"] = _extensions.get("GOOGLE_backgrounds", None)

	                # scaling_factor: float
	                json_result_dict["GOOGLE_real_world_transform"] = _extensions.get("GOOGLE_real_world_transform", None)

	                # rotation: [float, float, float, float]
	                json_result_dict["GOOGLE_scene_rotation"] = _extensions.get("GOOGLE_scene_rotation", None)

	                # light: int
	                json_result_dict["GOOGLE_lights_image_based"] = _extensions.get("GOOGLE_lights_image_based", None)

	                _scene_extras = gltf_data["scenes"][0].get("extras", {})

	                camera = {}
	                cameras = gltf_data.get("cameras", [])
	                if cameras:
	                    # Use the first node that references a camera and copy its
	                    # transform onto the first camera entry.
	                    camera_node = None
	                    for node in gltf_data["nodes"]:
	                        if node.get("camera", None) is not None:
	                            camera_node = node
	                            break
	                    if camera_node is not None:
	                        if "matrix" in camera_node:
	                            tr = decompose_matrix(camera_node["matrix"])
	                            cameras[0]["translation"] = tr["translation"]
	                            cameras[0]["rotation"] = tr["rotation"]
	                        elif "translation" in camera_node:
	                            cameras[0]["translation"] = camera_node["translation"]
	                            if "rotation" in camera_node:
	                                cameras[0]["rotation"] = camera_node["rotation"]
	                            else:
	                                print("Translation without rotation", dir_name)
	                        else:
	                            print("Unknown camera transform type", dir_name)
	                    camera.update(cameras[0])

	                # panLimits: min: [float, float, float], max: [float, float, float]
	                # yawLimits: {min: float, max: float}
	                # pitchLimits: {min: float, max: float}
	                # distanceLimits: {min: float, max: float}
	                # mode: orbit, moveableOrbit
	                # pivot: [float, float, float]
	                camera["GOOGLE_camera_settings"] = _extensions.get("GOOGLE_camera_settings", None)

	                # NOTE(review): the merged `camera` dict built above is never
	                # stored - the raw `cameras` list is - so GOOGLE_camera_settings
	                # does not reach the output. Left unchanged because switching
	                # would alter the output schema; confirm which one is intended.
	                json_result_dict["camera"] = cameras

	                # groundColor: [float, float, float]
	                json_result_dict["GOOGLE_hemi_light"] = _scene_extras.get("GOOGLE_hemi_light", None)

	                # visualCenterPoint: 3 floats
	                # stats: {radius: float, stdev: float, centroid: 3 floats}
	                json_result_dict["GOOGLE_geometry_data"] = _scene_extras.get("GOOGLE_geometry_data", None)

	            # NOTE(review): assumed to apply whether or not a glTF file was
	            # found, since it depends only on field 21 and the formats - confirm.
	            background = asset_fields[21]
	            if background is not None and len(background) == 1:
	                json_result_dict["presentationParams"] = {
	                    "orientingRotation": {"w": 1},
	                    "colorSpace": color_space,
	                    "backgroundColor": background[0]
	                }

	            # 36 = int array, 37 = int array, 38 = textures?
	            # json_data[0][2] is just the upload url, [3]-[5] are bools,
	            # [6] has 12 elements ([6][0] is probably the asset id again).

	        except json.decoder.JSONDecodeError:
	            print(f"Error parsing json: {embed_path}")
	            continue

	        except Exception as e:
	            # Catch-all so that one malformed asset does not abort the whole
	            # batch and leave the aggregate files truncated.
	            print(f"Unknown error for {embed_path}: {e}")
	            continue

	        # Output happens outside the try block so that real I/O failures still
	        # propagate instead of being reported as an "Unknown error".
	        # Serialise before writing so a separator is only emitted when an
	        # entry actually follows it.
	        entry_text = json.dumps({f"{asset_id}": json_result_dict}, indent=4)
	        if wrote_entry:
	            all_json_outfile.write(",\n")
	        all_json_outfile.write(entry_text)
	        wrote_entry = True

	        # "assetId" is added only after the all_data.json entry has been
	        # serialised, so it appears in the .jsonl line and data.json only.
	        json_result_dict["assetId"] = asset_id
	        json.dump(json_result_dict, all_jsonl_outfile)
	        all_jsonl_outfile.write("\n")

	        os.makedirs(os.path.dirname(json_file_path), exist_ok=True)
	        with open(json_file_path, "w", encoding="utf-8") as json_outfile:
	            json.dump(json_result_dict, json_outfile, indent=4)

	    all_json_outfile.write("]")

	# if len(regex_results) != 1:
	# print(len(regex_results), path_name)