giangnguyen2412/cub200_class_definition.py

## cub200_class_definition.py
# Create definitions for CUB-200 (CUB200 category definition - CUB200 class definition)
# All .txt files can be found here: https://www.kaggle.com/datasets/veeralakrishna/200-bird-species-with-11788-images?resource=download&select=CUB_200_2011.tgz
import re
import numpy as np
import pickle

# Map every image ID to its class ID. The dataset has 11788 images and each
# image carries one class ID in the range 1..200.
# Line format read here: "<image_id> <class_id>".
img2class_file = 'image_class_labels.txt'
img2class_dict = {}
# The context manager closes the file as soon as parsing is finished instead of
# leaving the handle open for the rest of the run.
with open(img2class_file) as input_f:
    for line in input_f:
        fields = line.split()
        if not fields:
            continue  # tolerate blank lines (e.g. a trailing empty line)
        img2class_dict[int(fields[0])] = int(fields[1])

# Read the attribute vocabulary: 312 binary attributes shared by all 200 bird
# classes (e.g. a pointy beak).
# Line format read here: "<attribute_id> has_<noun>::<adjective>".
attribute_name_file = 'attributes.txt'
class_att_dict = {}
# The context manager guarantees the file is closed once the loop is done.
with open(attribute_name_file) as input_f:
    for line in input_f:
        fields = line.split(' ')
        if len(fields) < 2:
            continue  # blank line: nothing to record
        attr_id = int(fields[0])
        noun_part, att_adj = fields[1].split('::')[:2]
        # Drop the leading "has_", e.g. has_bill_shape -> bill_shape
        att_noun = noun_part[4:]
        # att_adj is stored exactly as read, trailing newline included, because
        # the sentence-building code later in this script handles that newline.
        class_att_dict[attr_id] = [att_noun, att_adj]

# Read the class ID -> class name mapping (200 entries).
# Line format read here: "<class_id> <class_name>".
class_name_file = 'classes.txt'
class_name_dict = {}
# The context manager guarantees the file is closed once the loop is done.
with open(class_name_file) as input_f:
    for line in input_f:
        fields = line.split(' ')
        if len(fields) < 2:
            continue  # blank line: nothing to record
        # The name is stored as read (trailing newline included); the newline
        # is removed later, when the definitions are keyed by class name.
        class_name_dict[int(fields[0])] = fields[1]

# Collect, for every image, the attributes that are annotated as present.
# Line format read here: "<image_id> <attribute_id> <is_present> ..."; only
# the first three fields are used. The result has one entry per image that has
# at least one present attribute (at most 11788 entries).
attribute_name_file = 'image_attribute_labels.txt'
text_att_dict = {}
# The context manager guarantees the file is closed once the loop is done.
with open(attribute_name_file) as input_f:
    for line in input_f:
        vals = line.split(' ')
        if len(vals) < 3:
            continue  # blank or truncated line
        if int(vals[2]) != 1:
            continue  # attribute not present in this image
        # The first column is an image ID (it is looked up in img2class_dict
        # further down), not a bird/class ID.
        img_id = int(vals[0])
        attr_id = int(vals[1])
        text_att_dict.setdefault(img_id, []).append(class_att_dict[attr_id])

# Collapse the per-image attribute lists into one list per bird class
# (11788 images -> 200 classes). Nothing is merged: images are visited in the
# iteration order of text_att_dict and each image overwrites the entry of its
# class, so a class ends up with the attributes of the last image visited.
definition_dict = {
    img2class_dict[image_id]: attributes
    for image_id, attributes in text_att_dict.items()
}

# Turn every class's attribute list into one sentence of the form
# "Birds having <adjective> <noun>, <adjective> <noun>, ... ."
# keyed by the class name.
#
# At most MAX_ATTRIBUTES attributes are used per class. The previous comment
# said "less than 7", but the loop stopped at index 7, i.e. after 8 attributes;
# that effective limit is kept so the generated text does not change.
MAX_ATTRIBUTES = 8
text_definition_dict = {}
for class_id, attributes in definition_dict.items():
    class_name = class_name_dict[class_id].strip()
    phrases = [
        # strip() removes the newline the adjective may still carry from the
        # attribute file; the separating space is added explicitly so it no
        # longer depends on that newline being present.
        adjective.strip().replace('_', '-') + ' ' + noun.replace('_', ' ')
        for noun, adjective in attributes[:MAX_ATTRIBUTES]
    ]
    # join() places ', ' between phrases by position, so the final '.' never
    # depends on comparing attribute values with the last list element.
    text_definition_dict[class_name] = 'Birds having ' + ', '.join(phrases) + '.'

# Persist the class-name -> definition mapping as a pickle file.
with open('CUB200_definition.pickle', 'wb') as pickle_file:
    pickle.dump(text_definition_dict, pickle_file)
	# Create definitions for CUB-200 (CUB200 category definition - CUB200 class definition)
	# All .txt files can be found here: https://www.kaggle.com/datasets/veeralakrishna/200-bird-species-with-11788-images?resource=download&select=CUB_200_2011.tgz
	import re
	import numpy as np
	import pickle

	# Map every image ID to its class ID. The dataset has 11788 images and each
	# image carries one class ID in the range 1..200.
	# Line format read here: "<image_id> <class_id>".
	# (Loop nesting restored: the body lines had lost their indentation.)
	img2class_file = 'image_class_labels.txt'
	img2class_dict = {}
	# The context manager closes the file as soon as parsing is finished.
	with open(img2class_file) as input_f:
		for line in input_f:
			fields = line.split()
			if not fields:
				continue  # tolerate blank lines (e.g. a trailing empty line)
			img2class_dict[int(fields[0])] = int(fields[1])

	# Read the attribute vocabulary: 312 binary attributes shared by all 200 bird
	# classes (e.g. a pointy beak).
	# Line format read here: "<attribute_id> has_<noun>::<adjective>".
	# (Loop nesting restored: the body lines had lost their indentation.)
	attribute_name_file = 'attributes.txt'
	class_att_dict = {}
	# The context manager guarantees the file is closed once the loop is done.
	with open(attribute_name_file) as input_f:
		for line in input_f:
			fields = line.split(' ')
			if len(fields) < 2:
				continue  # blank line: nothing to record
			attr_id = int(fields[0])
			noun_part, att_adj = fields[1].split('::')[:2]
			# Drop the leading "has_", e.g. has_bill_shape -> bill_shape
			att_noun = noun_part[4:]
			# att_adj is stored exactly as read, trailing newline included,
			# because the sentence-building code further down handles it.
			class_att_dict[attr_id] = [att_noun, att_adj]

	# Read the class ID -> class name mapping (200 entries).
	# Line format read here: "<class_id> <class_name>".
	# (Loop nesting restored: the body lines had lost their indentation.)
	class_name_file = 'classes.txt'
	class_name_dict = {}
	# The context manager guarantees the file is closed once the loop is done.
	with open(class_name_file) as input_f:
		for line in input_f:
			fields = line.split(' ')
			if len(fields) < 2:
				continue  # blank line: nothing to record
			# The name is stored as read (trailing newline included); the
			# newline is removed later, when definitions are keyed by name.
			class_name_dict[int(fields[0])] = fields[1]

	# Collect, for every image, the attributes that are annotated as present.
	# Line format read here: "<image_id> <attribute_id> <is_present> ..."; only
	# the first three fields are used. The result has one entry per image that
	# has at least one present attribute (at most 11788 entries).
	# (Loop/branch nesting restored: the body lines had lost their indentation.)
	attribute_name_file = 'image_attribute_labels.txt'
	text_att_dict = {}
	# The context manager guarantees the file is closed once the loop is done.
	with open(attribute_name_file) as input_f:
		for line in input_f:
			vals = line.split(' ')
			if len(vals) < 3:
				continue  # blank or truncated line
			if int(vals[2]) != 1:
				continue  # attribute not present in this image
			# The first column is an image ID (it is looked up in
			# img2class_dict further down), not a bird/class ID.
			img_id = int(vals[0])
			attr_id = int(vals[1])
			text_att_dict.setdefault(img_id, []).append(class_att_dict[attr_id])

	# Collapse the per-image attribute lists into one list per bird class
	# (11788 images -> 200 classes). Nothing is merged: images are visited in
	# the iteration order of text_att_dict and each image overwrites the entry
	# of its class, so a class ends up with the attributes of the last image
	# visited.
	# (Rewritten as a single expression: the loop body had lost its indentation.)
	definition_dict = {
		img2class_dict[image_id]: attributes
		for image_id, attributes in text_att_dict.items()
	}

	# Turn every class's attribute list into one sentence of the form
	# "Birds having <adjective> <noun>, <adjective> <noun>, ... ."
	# keyed by the class name.
	#
	# At most MAX_ATTRIBUTES attributes are used per class. The previous comment
	# said "less than 7", but the condition stopped at index 7, i.e. after 8
	# attributes; that effective limit is kept here.
	# (Loop nesting restored: the body lines had lost their indentation.)
	MAX_ATTRIBUTES = 8
	text_definition_dict = {}
	for class_id, attributes in definition_dict.items():
		class_name = class_name_dict[class_id].strip()
		phrases = [
			# strip() removes the newline the adjective may still carry from
			# the attribute file; the separating space is added explicitly so
			# it does not depend on that newline being present.
			adjective.strip().replace('_', '-') + ' ' + noun.replace('_', ' ')
			for noun, adjective in attributes[:MAX_ATTRIBUTES]
		]
		# join() places ', ' between phrases by position, so the final '.'
		# never depends on comparing attribute values with the last element.
		text_definition_dict[class_name] = 'Birds having ' + ', '.join(phrases) + '.'

	# Persist the class-name -> definition mapping as a pickle file.
	# (Body indentation restored so the dump runs inside the with-block.)
	with open('CUB200_definition.pickle', 'wb') as pickle_file:
		pickle.dump(text_definition_dict, pickle_file)