-
-
Save swyoon/8185b3dcf08ec728fb22b99016dd533f to your computer and use it in GitHub Desktop.
import numpy as np | |
import tensorflow as tf | |
__author__ = "Sangwoong Yoon" | |
def np_to_tfrecords(X, Y, file_path_prefix, verbose=True): | |
""" | |
Converts a Numpy array (or two Numpy arrays) into a tfrecord file. | |
For supervised learning, feed training inputs to X and training labels to Y. | |
For unsupervised learning, only feed training inputs to X, and feed None to Y. | |
The length of the first dimensions of X and Y should be the number of samples. | |
Parameters | |
---------- | |
X : numpy.ndarray of rank 2 | |
Numpy array for training inputs. Its dtype should be float32, float64, or int64. | |
If X has a higher rank, it should be rshape before fed to this function. | |
Y : numpy.ndarray of rank 2 or None | |
Numpy array for training labels. Its dtype should be float32, float64, or int64. | |
None if there is no label array. | |
file_path_prefix : str | |
The path and name of the resulting tfrecord file to be generated, without '.tfrecords' | |
verbose : bool | |
If true, progress is reported. | |
Raises | |
------ | |
ValueError | |
If input type is not float (64 or 32) or int. | |
""" | |
def _dtype_feature(ndarray): | |
"""match appropriate tf.train.Feature class with dtype of ndarray. """ | |
assert isinstance(ndarray, np.ndarray) | |
dtype_ = ndarray.dtype | |
if dtype_ == np.float64 or dtype_ == np.float32: | |
return lambda array: tf.train.Feature(float_list=tf.train.FloatList(value=array)) | |
elif dtype_ == np.int64: | |
return lambda array: tf.train.Feature(int64_list=tf.train.Int64List(value=array)) | |
else: | |
raise ValueError("The input should be numpy ndarray. \ | |
Instaed got {}".format(ndarray.dtype)) | |
assert isinstance(X, np.ndarray) | |
assert len(X.shape) == 2 # If X has a higher rank, | |
# it should be rshape before fed to this function. | |
assert isinstance(Y, np.ndarray) or Y is None | |
# load appropriate tf.train.Feature class depending on dtype | |
dtype_feature_x = _dtype_feature(X) | |
if Y is not None: | |
assert X.shape[0] == Y.shape[0] | |
assert len(Y.shape) == 2 | |
dtype_feature_y = _dtype_feature(Y) | |
# Generate tfrecord writer | |
result_tf_file = file_path_prefix + '.tfrecords' | |
writer = tf.python_io.TFRecordWriter(result_tf_file) | |
if verbose: | |
print "Serializing {:d} examples into {}".format(X.shape[0], result_tf_file) | |
# iterate over each sample, | |
# and serialize it as ProtoBuf. | |
for idx in range(X.shape[0]): | |
x = X[idx] | |
if Y is not None: | |
y = Y[idx] | |
d_feature = {} | |
d_feature['X'] = dtype_feature_x(x) | |
if Y is not None: | |
d_feature['Y'] = dtype_feature_y(y) | |
features = tf.train.Features(feature=d_feature) | |
example = tf.train.Example(features=features) | |
serialized = example.SerializeToString() | |
writer.write(serialized) | |
if verbose: | |
print "Writing {} done!".format(result_tf_file) | |
################################# | |
## Test and Use Cases ## | |
################################# | |
# 1-1. Saving a dataset with input and label (supervised learning) | |
xx = np.random.randn(10,5) | |
yy = np.random.randn(10,1) | |
np_to_tfrecords(xx, yy, 'test1', verbose=True) | |
# 1-2. Check if the data is stored correctly | |
# open the saved file and check the first entries | |
for serialized_example in tf.python_io.tf_record_iterator('test1.tfrecords'): | |
example = tf.train.Example() | |
example.ParseFromString(serialized_example) | |
x_1 = np.array(example.features.feature['X'].float_list.value) | |
y_1 = np.array(example.features.feature['Y'].float_list.value) | |
break | |
# the numbers may be slightly different because of the floating point error. | |
print xx[0] | |
print x_1 | |
print yy[0] | |
print y_1 | |
# 2. Saving a dataset with only inputs (unsupervised learning) | |
xx = np.random.randn(100,100) | |
np_to_tfrecords(xx, None, 'test2', verbose=True) |
If someone is looking for a faster solution check my Kaggle kernel
If someone is looking for a faster solution check my Kaggle kernel
@tchaye56
Well that notebook(version 17) has an error.
But I'm glad that someone is working on a faster version :-)
Wait for 10 min I just sent a new commit
Thanks for this.
Please, after saving test1, how can I recover the whole array, i.e xx and not just x_1? Thanks in advance.
What about just using
tensor = tf.convert_to_tensor(array) result = tf.io.serialize_tensor(tensor)
How would 1-2 look like with the tf updates? tf_record_iterator does not work anymore, and I can't find out how to recover the file.
why is it important for len(X.shape) == 2?
thanks for your sharing! may i know what if i have X with higher dimenions? is that still possible to covert them to tfrecords? thanks!
It's been so long since I wrote this code and personally, I have moved to pytorch.
I have no idea if this would work for tensors with a higher rank.
My personal guess is it should probably work, if you comment out the assertions (L45, L53).
Could you try it? @songssssss
thanks for your reply! i tried to use tf.data.Dataset instead and it solved my memory issue
Thanks for the code. I'm using TF version 2.15.0
I received this error:
module 'tensorflow' has no attribute 'python_io'
I fixed it by changing tf.python_io to tf.io. However, I couldn't find a replacement of "tf_record_itorator", but I guess it is not needed.
I also received another error:
return lambda array: tf.train.Feature(int64_list=tf.train.Int64List(value=array))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
TypeError: Value must be iterable
Thank you very much. But your code is very slow for huge datasets I think this is because you write the records one by one. I don't know if it is possible to write the whole data
using tf.train.SequenceExample
?