Create a dataset and task - tiniest ImageNet¶
- An example of how to create a custom dataset and task with the OpenML API and upload them to the OpenML server.
- Note that you need an API key from the OpenML website to upload datasets and tasks; a minimal configuration sketch follows.
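The key can be set either in code or in openml-python's configuration file; a short sketch, where the key value is a placeholder and the config-file path is the assumed default location:

import openml

# Option 1: set the key for the current session (placeholder value, not a real key)
openml.config.apikey = "YOUR_API_KEY"

# Option 2 (assumed default location): persist it in ~/.config/openml/config
# with a line such as:  apikey = YOUR_API_KEY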
In [2]:
import glob
import os
import zipfile

import openml
import pandas as pd
import requests
from openml.datasets.functions import create_dataset
Create dataset on OpenML¶
- Instead of building our own dataset from scratch, we use Tiny ImageNet, a 200-class subset of ImageNet provided by Stanford's CS231n course.
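For orientation: the extracted archive stores training images as tiny-imagenet-200/train/&lt;wnid&gt;/images/&lt;file&gt;.JPEG, which is what the glob pattern and label function below rely on. A quick sanity check (the exact file names are assumptions about the extracted layout):

import glob

paths = glob.glob("datasets/tiny-imagenet-200/train/*/*/*.JPEG")
print(len(paths))  # expected: 100000 training images (200 classes x 500)
print(paths[0])    # e.g. datasets/tiny-imagenet-200/train/<wnid>/images/<wnid>_0.JPEG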
In [9]:
def create_tiny_imagenet():
    dir_name = "datasets"
    os.makedirs(dir_name, exist_ok=True)

    # Download the archive only if it is not already present
    url = "http://cs231n.stanford.edu/tiny-imagenet-200.zip"
    zip_path = f"{dir_name}/tiny-imagenet-200.zip"
    if not os.path.exists(zip_path):
        r = requests.get(url, stream=True)
        with open(zip_path, "wb") as f:
            f.write(r.content)
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(f"{dir_name}/")

    # Recursively find all the training images
    image_paths = glob.glob(f"{dir_name}/tiny-imagenet-200/train/*/*/*.JPEG")
    # Drop the leading "datasets/" component so paths are relative to the data root
    image_paths = [path.split("/", 1)[-1] for path in image_paths]

    # Build a dataframe with the image path and the label
    # (the label is the WordNet ID directory name in the path)
    label_func = lambda x: x.split("/")[2]
    df = pd.DataFrame(image_paths, columns=["image_path"])
    df["label"] = df["image_path"].apply(label_func)

    # Encode types
    df["image_path"] = df["image_path"].astype("string")
    df["label"] = df["label"].astype("string")

    name = "tiny-imagenet-200"
    description = (
        "Tiny ImageNet contains 100000 images of 200 classes (500 for each class) "
        "downsized to 64 x 64 colored images. Each class has 500 training images, "
        "50 validation images, and 50 test images. The dataset here just contains "
        "links to the images and the labels. The dataset can be downloaded from the "
        "official website [here](http://cs231n.stanford.edu/tiny-imagenet-200.zip).\n"
        "Link to the paper - [Tiny ImageNet Classification with CNN]"
        "(https://cs231n.stanford.edu/reports/2017/pdfs/930.pdf)"
    )
    citation = "Wu, J., Zhang, Q., & Xu, G. (2017). Tiny imagenet challenge. Technical report."

    tinyim = create_dataset(
        name=name,
        description=description,
        creator="Jiayu Wu, Qixiang Zhang, Guoxi Xu",
        contributor="Jiayu Wu, Qixiang Zhang, Guoxi Xu",
        collection_date="2017",
        language="English",
        licence="DbCL v1.0",
        default_target_attribute="label",
        attributes="auto",
        data=df,
        citation=citation,
        ignore_attribute=None,
    )
    openml.config.apikey = ''  # fill in your OpenML API key before publishing
    tinyim.publish()
    print(f"URL for dataset: {tinyim.openml_url}")
In [10]:
create_tiny_imagenet()
# https://www.openml.org/d/46346
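Once publishing succeeds, the dataset can be fetched back from OpenML to confirm the upload. A minimal round-trip check, assuming the dataset ID 46346 from the comment above:

import openml

dataset = openml.datasets.get_dataset(46346)
df, _, _, _ = dataset.get_data()
print(df.shape)               # expected: (100000, 2)
print(df["label"].nunique())  # expected: 200 classes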
Another, even tinier dataset¶
- We subset the previous dataset to 20 images per class.
In [19]:
def create_tiniest_imagenet():
    dir_name = "datasets"
    os.makedirs(dir_name, exist_ok=True)

    # Download the archive only if it is not already present
    url = "http://cs231n.stanford.edu/tiny-imagenet-200.zip"
    zip_path = f"{dir_name}/tiny-imagenet-200.zip"
    if not os.path.exists(zip_path):
        r = requests.get(url, stream=True)
        with open(zip_path, "wb") as f:
            f.write(r.content)
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(f"{dir_name}/")

    # Recursively find all the training images
    image_paths = glob.glob(f"{dir_name}/tiny-imagenet-200/train/*/*/*.JPEG")
    # Drop the leading "datasets/" component so paths are relative to the data root
    image_paths = [path.split("/", 1)[-1] for path in image_paths]

    # Build a dataframe with the image path and the label
    # (the label is the WordNet ID directory name in the path)
    label_func = lambda x: x.split("/")[2]
    df = pd.DataFrame(image_paths, columns=["image_path"])
    df["label"] = df["image_path"].apply(label_func)

    # Encode types
    df["image_path"] = df["image_path"].astype("string")
    df["label"] = df["label"].astype("string")

    # Keep only the first 20 images for each label
    df = df.groupby("label").head(20)

    name = "tiniest-imagenet-200"
    description = (
        "Tiny ImageNet contains 100000 images of 200 classes (500 for each class) "
        "downsized to 64 x 64 colored images. !!! This dataset only links to 20 images "
        "per class (instead of the usual 500) and is ONLY for quickly testing a "
        "framework. !!! Each class has 500 training images, 50 validation images, and "
        "50 test images. The dataset here just contains links to the images and the "
        "labels. The dataset can be downloaded from the official website "
        "[here](http://cs231n.stanford.edu/tiny-imagenet-200.zip).\n"
        "Link to the paper - [Tiny ImageNet Classification with CNN]"
        "(https://cs231n.stanford.edu/reports/2017/pdfs/930.pdf)"
    )
    citation = "Wu, J., Zhang, Q., & Xu, G. (2017). Tiny imagenet challenge. Technical report."

    tinyim = create_dataset(
        name=name,
        description=description,
        creator="Jiayu Wu, Qixiang Zhang, Guoxi Xu",
        contributor="Jiayu Wu, Qixiang Zhang, Guoxi Xu",
        collection_date="2017",
        language="English",
        licence="DbCL v1.0",
        default_target_attribute="label",
        attributes="auto",
        data=df,
        citation=citation,
        ignore_attribute=None,
    )
    openml.config.apikey = ''  # fill in your OpenML API key before publishing
    tinyim.publish()
    print(f"URL for dataset: {tinyim.openml_url}")
In [20]:
create_tiniest_imagenet()
# https://www.openml.org/d/46347
URL for dataset: https://www.openml.org/d/46347
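The same round-trip check works for the subset; every per-class count should be exactly 20 (dataset ID 46347 as printed above):

import openml

dataset = openml.datasets.get_dataset(46347)
df, _, _, _ = dataset.get_data()
print(df.groupby("label").size().describe())  # min == max == 20 if the subset is correct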
Create task on OpenML¶
- To actually use the OpenML PyTorch API, we need a task associated with the dataset. This is how we create one.
In [27]:
def create_task():
    # Define task parameters
    task_type = openml.tasks.TaskType.SUPERVISED_CLASSIFICATION
    dataset_id = 46347  # Obtained from the dataset creation step
    evaluation_measure = "predictive_accuracy"
    target_name = "label"
    # Recover the class labels from the published dataset. (The original notebook
    # read them from a locally saved CSV that is not created anywhere above.)
    dataset = openml.datasets.get_dataset(dataset_id)
    df, _, _, _ = dataset.get_data()
    class_labels = list(df["label"].unique())
    cost_matrix = None

    # Create the task; estimation procedure 1 is 10-fold cross-validation
    new_task = openml.tasks.create_task(
        task_type=task_type,
        dataset_id=dataset_id,
        estimation_procedure_id=1,
        evaluation_measure=evaluation_measure,
        target_name=target_name,
        class_labels=class_labels,
        cost_matrix=cost_matrix,
    )
    openml.config.apikey = ''  # fill in your OpenML API key before publishing
    new_task.publish()
    print(f"URL for task: {new_task.openml_url}")
In [28]:
create_task()
# https://www.openml.org/t/362128
URL for task: https://www.openml.org/t/362128
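With the task published, downstream code (e.g. the OpenML PyTorch extension) can retrieve it by ID. A brief sketch of fetching the task and the first cross-validation split of its estimation procedure:

import openml

task = openml.tasks.get_task(362128)
print(task.get_dataset().name)  # tiniest-imagenet-200

# Indices for repeat 0, fold 0 of the task's estimation procedure
train_idx, test_idx = task.get_train_test_split_indices(repeat=0, fold=0)
print(len(train_idx), len(test_idx))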