Format ubyte merupakan format binary untuk menyimpan dataset yang terkompresi. Kalian tentu tahu donk dataset MNIST sebagai benchmark dalam menguji algoritma deep learning CNN. Bisa kalian peroleh image dan label nya di googleapis.com melalui link berikut
Untuk dataset training
https://storage.googleapis.com/cvdf-datasets/mnist/train-images-idx3-ubyte.gz
https://storage.googleapis.com/cvdf-datasets/mnist/train-labels-idx1-ubyte.gz
untuk dataset testing
https://storage.googleapis.com/cvdf-datasets/mnist/t10k-images-idx3-ubyte.gz
https://storage.googleapis.com/cvdf-datasets/mnist/t10k-labels-idx1-ubyte.gz
Setelah kalian download, file tersebut berformat ubyte yang terkompresi gzip — tidak perlu di-extract dengan WinRAR, karena kita akan membacanya langsung dari file .gz menggunakan modul gzip di Python.

Oiya, saya menggunakan tensorflow 2.4.1 ya
Setelah kalian download (tidak perlu extract, karena kita pakai gzip), berikut untuk menampilkan gambarnya
# Read the first `num_images` images from the gzipped MNIST image file
# and display one of them.
import gzip
import numpy as np
import matplotlib.pyplot as plt

image_size = 28   # MNIST images are 28x28 pixels
num_images = 100  # how many images to read

# Use a context manager so the file handle is always closed
# (the original left it open).
with gzip.open('train-images-idx3-ubyte.gz', 'r') as f:
    f.read(16)  # skip the 16-byte IDX header (magic, count, rows, cols)
    buf = f.read(image_size * image_size * num_images)

data = np.frombuffer(buf, dtype=np.uint8).astype(np.float32)
data = data.reshape(num_images, image_size, image_size, 1)

# Index of the image to display.
no_gambar = 10
image = np.asarray(data[no_gambar]).squeeze()
plt.imshow(image)
plt.show()
# Read the first `num_images` labels from the gzipped MNIST label file.
# NOTE: the loop body below was not indented in the original (a syntax
# error); the intended per-label read loop is restored here, and the
# file is closed via a context manager.
with gzip.open('train-labels-idx1-ubyte.gz', 'r') as f:
    f.read(8)  # skip the 8-byte IDX header (magic, item count)
    for i in range(0, num_images):
        buf = f.read(1)  # one label byte per image
        labels = np.frombuffer(buf, dtype=np.uint8).astype(np.int64)
        print(labels)
Ref: https://stackoverflow.com/questions/40427435/extract-images-from-idx3-ubyte-file-or-gzip-via-python
Script Download
Contents
Untuk cara kedua dengan langsung download. Kalian bisa menggunakan kode berikut untuk melakukan download di python
import os
from six.moves import urllib
import tensorflow as tf
import gzip
import numpy
import sys
def download(directory, filename):
    """Download a file from the MNIST dataset if not already done.

    Args:
        directory: destination directory for the downloaded file.
        filename: base name of the file (the '.gz' suffix is appended
            when building the download URL).

    Returns:
        The local path of the downloaded file.
    """
    filepath = os.path.join(directory, filename)
    if tf.io.gfile.exists(filepath):
        # Already downloaded on a previous run; skip the network call.
        return filepath
    if not tf.io.gfile.exists(directory):
        tf.io.gfile.mkdir(directory)
    # CVDF mirror of http://yann.lecun.com/exdb/mnist/
    url = 'https://storage.googleapis.com/cvdf-datasets/mnist/' + filename + '.gz'
    # urlretrieve saves to a temp file; the .gz content is copied as-is
    # (it is NOT decompressed here -- gzip.open handles that later).
    temp_file_name, _ = urllib.request.urlretrieve(url)
    tf.io.gfile.copy(temp_file_name, filepath)
    with tf.io.gfile.GFile(filepath) as f:
        size = f.size()
    print('Successfully downloaded', filename, size, 'bytes.')
    return filepath
# Fetch the MNIST training images and labels into the current directory.
directory = '.'
images_file = 'train-images-idx3-ubyte'
labels_file = 'train-labels-idx1-ubyte'

result_images_file = download(directory, images_file)
result_labels_file = download(directory, labels_file)
Cari tahu jumlah records yang ada
Setelah didownload, kita bisa langsung baca dan sekaligus disiapkan untuk dibuat TFRecord, kita buat parsing data menggunakan cara pertama namun menggunakan _read32() untuk mengetahui jumlah data yang ada didalam file tersebut.
Oiya, kita butuh dense_to_one_hot() sebagai outputnya (bila kalian ingin tahu apa itu one hot bisa pelajari ini https://softscients.com/2020/11/06/belajar-membuat-desain-neural-network-dengan-tensorflow/)
def dense_to_one_hot(labels_dense, num_classes):
    """Convert class labels from scalars to one-hot vectors.

    Args:
        labels_dense: 1D array of integer class labels.
        num_classes: number of classes (columns of the output).

    Returns:
        A (len(labels_dense), num_classes) float array with a single 1
        in each row, at the column given by the label.
    """
    num_labels = labels_dense.shape[0]
    labels_one_hot = numpy.zeros((num_labels, num_classes))
    # Fancy indexing: set one cell per row at the label's column.
    labels_one_hot[numpy.arange(num_labels), labels_dense.ravel()] = 1
    return labels_one_hot
Selanjutnya kita buat extract_images() dan extract_labels() untuk unzip dan membaca ubyte nya
def _read32(bytestream):
    """Read one big-endian unsigned 32-bit integer from the stream.

    The original article uses _read32() but never defines it; without
    this helper both extract_* functions raise NameError.
    """
    dt = numpy.dtype(numpy.uint32).newbyteorder('>')
    return numpy.frombuffer(bytestream.read(4), dtype=dt)[0]


def extract_images(result_images_file):
    """Extract the images into a 4D uint8 numpy array [index, y, x, depth].

    Args:
        result_images_file: path (or file object) of the gzipped
            idx3-ubyte image file.

    Returns:
        A uint8 numpy array of shape [num_images, rows, cols, 1].

    Raises:
        ValueError: if the stream does not start with magic number 2051.
    """
    with gzip.open(result_images_file, 'rb') as bytestream:
        magic = _read32(bytestream)
        if magic != 2051:
            raise ValueError('Invalid magic number %d in MNIST image file: %s' %
                             (magic, result_images_file))
        # The IDX header stores the image count and dimensions, so
        # nothing is hard-coded here.
        num_images = _read32(bytestream)
        rows = _read32(bytestream)
        cols = _read32(bytestream)
        buf = bytestream.read(rows * cols * num_images)
        data = numpy.frombuffer(buf, dtype=numpy.uint8)
        return data.reshape(num_images, rows, cols, 1)
def _read32(bytestream):
    """Read one big-endian unsigned 32-bit integer from the stream.

    The original article uses _read32() but never defines it; without
    this helper both extract_* functions raise NameError.
    """
    dt = numpy.dtype(numpy.uint32).newbyteorder('>')
    return numpy.frombuffer(bytestream.read(4), dtype=dt)[0]


def extract_labels(result_labels_file, one_hot=False, num_classes=10):
    """Extract the labels into a 1D uint8 numpy array [index].

    Args:
        result_labels_file: path (or file object) of the gzipped
            idx1-ubyte label file.
        one_hot: if True, return one-hot rows via dense_to_one_hot().
        num_classes: number of classes for the one-hot encoding.

    Returns:
        A 1D uint8 numpy array of labels, or a 2D one-hot array when
        one_hot is True.

    Raises:
        ValueError: If the bytestream doesn't start with 2049.
    """
    with gzip.open(result_labels_file, 'rb') as bytestream:
        magic = _read32(bytestream)
        if magic != 2049:
            raise ValueError('Invalid magic number %d in MNIST label file: %s' %
                             (magic, result_labels_file))
        num_items = _read32(bytestream)
        buf = bytestream.read(num_items)
        labels = numpy.frombuffer(buf, dtype=numpy.uint8)
        if one_hot:
            return dense_to_one_hot(labels, num_classes)
        return labels
Langsung saja kita panggil
# Parse the downloaded files: images as a 4D uint8 array, labels one-hot.
train_images = extract_images(result_images_file)
train_labels = extract_labels(result_labels_file, one_hot=True)
Membuat TFRecords
Membuat helper functions to parse int and bytes features terlebih dahulu / encoding
def _int64_feature(value):
    """Wrap a single integer as a tf.train.Feature (int64_list)."""
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))


def _bytes_feature(value):
    """Wrap a single bytes object as a tf.train.Feature (bytes_list)."""
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
Kita ambil sampel 1.000 saja jangan banyak2
# Keep only a small sample (1,000 examples) so the demo stays fast.
jumlah_data = 1000
train_images = train_images[:jumlah_data]
train_labels = train_labels[:jumlah_data]
Mari kita coba 1 sample saja, simpan dengan nama train_single.tfrecords
# Write a single example to train_single.tfrecords.
image = train_images[0]
image_label = train_labels[0]
_, rows, cols, depth = train_images.shape
filename = "train_single.tfrecords"
with tf.io.TFRecordWriter(filename) as writer:
    # ndarray.tostring() was deprecated and then removed from numpy;
    # tobytes() is the supported, identical-output replacement.
    image_raw = image.tobytes()
    example = tf.train.Example(features=tf.train.Features(feature={
        'height': _int64_feature(rows),
        'width': _int64_feature(cols),
        'depth': _int64_feature(depth),
        # image_label is a one-hot vector (extract_labels(..., one_hot=True)),
        # so int() on it would raise; store the class index instead.
        'label': _int64_feature(int(numpy.argmax(image_label))),
        'image_raw': _bytes_feature(image_raw)
    }))
    writer.write(example.SerializeToString())
Atau untuk semua sekaligus, gunakan kode berikut dan simpan dengan nama train.tfrecords
# Write all (sampled) training examples into one TFRecord file.
filename = "train.tfrecords"
num_examples, rows, cols, depth = train_images.shape
data_set = list(zip(train_images, train_labels))
dataset_length = len(data_set)
with tf.io.TFRecordWriter(filename) as writer:
    for index, (image, label) in enumerate(data_set):
        sys.stdout.write(f"\rProcessing sample {index+1} of {dataset_length}")
        sys.stdout.flush()
        # ndarray.tostring() was deprecated and then removed from numpy;
        # tobytes() is the supported, identical-output replacement.
        image_raw = image.tobytes()
        example = tf.train.Example(features=tf.train.Features(feature={
            'height': _int64_feature(rows),
            'width': _int64_feature(cols),
            'depth': _int64_feature(depth),
            # `label` is a one-hot vector (extract_labels(..., one_hot=True)),
            # so int(label) would raise; store the class index instead.
            'label': _int64_feature(int(numpy.argmax(label))),
            'image_raw': _bytes_feature(image_raw)
        }))
        writer.write(example.SerializeToString())
Membaca TFRecords
Untuk membaca TFRecord kita butuh description/schema, kita buat saja untuk ambil data height dan width serta image_raw nya
import tensorflow as tf
from matplotlib import pyplot as plt
import numpy as np
# Read the data back out.
def decode_fn(record_bytes):
    """Parse one serialized tf.train.Example into a feature dict.

    The schema must match what was written: three int64 scalars plus
    the raw image bytes stored as a string feature.
    """
    schema = {
        "height": tf.io.FixedLenFeature([], dtype=tf.int64),
        "width": tf.io.FixedLenFeature([], dtype=tf.int64),
        "label": tf.io.FixedLenFeature([], dtype=tf.int64),
        'image_raw': tf.io.FixedLenFeature([], tf.string),
    }
    return tf.io.parse_single_example(record_bytes, schema)
ingat ya! image_raw nya didalam encoding disimpan dalam format string, oleh karena itu butuh mekanisme khusus! Sekarang kita baca yang train_single.tfrecords saja
# Read the single-example TFRecord back and display the image.
filename = 'train_single.tfrecords'
dataset = tf.data.TFRecordDataset(filename).map(decode_fn)
for batch in dataset:
    print("x = {height:.0f}, y = {width:.0f}, label={label:.0f}".format(**batch))
    # image_raw was stored as raw bytes; decode back to uint8 pixels.
    image = tf.io.decode_raw(batch['image_raw'], tf.uint8)
    plt.figure(figsize=(7, 7))
    plt.imshow(image.numpy().reshape([28,28]))
    plt.show()
perhatikan bahwa image_raw harus di-convert ke uint8 dan jangan lupa di-reshape!

Lebih lanjut bila ingin tahu mengenai TF Record Dataset