Create a dataset and task - tiniest ImageNet¶
- An example of how to create a custom dataset and task with the OpenML API and upload them to the OpenML server.
- Note that you need an API key from the OpenML website to upload datasets and tasks; a minimal configuration sketch follows.
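The key can be set either in code or in openml-python's configuration file; a short sketch, where the key value is a placeholder and the config-file path is the assumed default location:

import openml

# Option 1: set the key for the current session (placeholder value, not a real key)
openml.config.apikey = "YOUR_API_KEY"

# Option 2 (assumed default location): persist it in ~/.config/openml/config
# with a line such as:  apikey = YOUR_API_KEY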
In [2]:
import glob
import os
import zipfile

import openml
import pandas as pd
import requests
from openml.datasets.functions import create_dataset
Create dataset on OpenML¶
- Instead of building our own dataset from scratch, we use Tiny ImageNet, a 200-class subset of ImageNet provided by Stanford's CS231n course.
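For orientation: the extracted archive stores training images as tiny-imagenet-200/train/&lt;wnid&gt;/images/&lt;file&gt;.JPEG, which is what the glob pattern and label function below rely on. A quick sanity check (the exact file names are assumptions about the extracted layout):

import glob

paths = glob.glob("datasets/tiny-imagenet-200/train/*/*/*.JPEG")
print(len(paths))  # expected: 100000 training images (200 classes x 500)
print(paths[0])    # e.g. datasets/tiny-imagenet-200/train/<wnid>/images/<wnid>_0.JPEG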
In [9]:
def create_tiny_imagenet():
    dir_name = "datasets"
    os.makedirs(dir_name, exist_ok=True)

    # Download the archive only if it is not already present
    url = "http://cs231n.stanford.edu/tiny-imagenet-200.zip"
    zip_path = f"{dir_name}/tiny-imagenet-200.zip"
    if not os.path.exists(zip_path):
        r = requests.get(url, stream=True)
        with open(zip_path, "wb") as f:
            f.write(r.content)
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(f"{dir_name}/")

    # Recursively find all the training images
    image_paths = glob.glob(f"{dir_name}/tiny-imagenet-200/train/*/*/*.JPEG")
    # Drop the leading "datasets/" component so paths are relative to the data root
    image_paths = [path.split("/", 1)[-1] for path in image_paths]

    # Build a dataframe with the image path and the label
    # (the label is the WordNet ID directory name in the path)
    label_func = lambda x: x.split("/")[2]
    df = pd.DataFrame(image_paths, columns=["image_path"])
    df["label"] = df["image_path"].apply(label_func)

    # Encode types
    df["image_path"] = df["image_path"].astype("string")
    df["label"] = df["label"].astype("string")

    name = "tiny-imagenet-200"
    description = (
        "Tiny ImageNet contains 100000 images of 200 classes (500 for each class) "
        "downsized to 64 x 64 colored images. Each class has 500 training images, "
        "50 validation images, and 50 test images. The dataset here just contains "
        "links to the images and the labels. The dataset can be downloaded from the "
        "official website [here](http://cs231n.stanford.edu/tiny-imagenet-200.zip).\n"
        "Link to the paper - [Tiny ImageNet Classification with CNN]"
        "(https://cs231n.stanford.edu/reports/2017/pdfs/930.pdf)"
    )
    citation = "Wu, J., Zhang, Q., & Xu, G. (2017). Tiny imagenet challenge. Technical report."

    tinyim = create_dataset(
        name=name,
        description=description,
        creator="Jiayu Wu, Qixiang Zhang, Guoxi Xu",
        contributor="Jiayu Wu, Qixiang Zhang, Guoxi Xu",
        collection_date="2017",
        language="English",
        licence="DbCL v1.0",
        default_target_attribute="label",
        attributes="auto",
        data=df,
        citation=citation,
        ignore_attribute=None,
    )
    openml.config.apikey = ''  # fill in your OpenML API key before publishing
    tinyim.publish()
    print(f"URL for dataset: {tinyim.openml_url}")
In [10]:
create_tiny_imagenet()
# https://www.openml.org/d/46346
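Once publishing succeeds, the dataset can be fetched back from OpenML to confirm the upload. A minimal round-trip check, assuming the dataset ID 46346 from the comment above:

import openml

dataset = openml.datasets.get_dataset(46346)
df, _, _, _ = dataset.get_data()
print(df.shape)               # expected: (100000, 2)
print(df["label"].nunique())  # expected: 200 classes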
Another, even tinier dataset¶
- We subset the previous dataset to 20 images per class.
In [19]:
def create_tiniest_imagenet():
    dir_name = "datasets"
    os.makedirs(dir_name, exist_ok=True)

    # Download the archive only if it is not already present
    url = "http://cs231n.stanford.edu/tiny-imagenet-200.zip"
    zip_path = f"{dir_name}/tiny-imagenet-200.zip"
    if not os.path.exists(zip_path):
        r = requests.get(url, stream=True)
        with open(zip_path, "wb") as f:
            f.write(r.content)
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(f"{dir_name}/")

    # Recursively find all the training images
    image_paths = glob.glob(f"{dir_name}/tiny-imagenet-200/train/*/*/*.JPEG")
    # Drop the leading "datasets/" component so paths are relative to the data root
    image_paths = [path.split("/", 1)[-1] for path in image_paths]

    # Build a dataframe with the image path and the label
    # (the label is the WordNet ID directory name in the path)
    label_func = lambda x: x.split("/")[2]
    df = pd.DataFrame(image_paths, columns=["image_path"])
    df["label"] = df["image_path"].apply(label_func)

    # Encode types
    df["image_path"] = df["image_path"].astype("string")
    df["label"] = df["label"].astype("string")

    # Keep only the first 20 images for each label
    df = df.groupby("label").head(20)

    name = "tiniest-imagenet-200"
    description = (
        "Tiny ImageNet contains 100000 images of 200 classes (500 for each class) "
        "downsized to 64 x 64 colored images. !!! This dataset only links to 20 images "
        "per class (instead of the usual 500) and is ONLY for quickly testing a "
        "framework. !!! Each class has 500 training images, 50 validation images, and "
        "50 test images. The dataset here just contains links to the images and the "
        "labels. The dataset can be downloaded from the official website "
        "[here](http://cs231n.stanford.edu/tiny-imagenet-200.zip).\n"
        "Link to the paper - [Tiny ImageNet Classification with CNN]"
        "(https://cs231n.stanford.edu/reports/2017/pdfs/930.pdf)"
    )
    citation = "Wu, J., Zhang, Q., & Xu, G. (2017). Tiny imagenet challenge. Technical report."

    tinyim = create_dataset(
        name=name,
        description=description,
        creator="Jiayu Wu, Qixiang Zhang, Guoxi Xu",
        contributor="Jiayu Wu, Qixiang Zhang, Guoxi Xu",
        collection_date="2017",
        language="English",
        licence="DbCL v1.0",
        default_target_attribute="label",
        attributes="auto",
        data=df,
        citation=citation,
        ignore_attribute=None,
    )
    openml.config.apikey = ''  # fill in your OpenML API key before publishing
    tinyim.publish()
    print(f"URL for dataset: {tinyim.openml_url}")
In [20]:
create_tiniest_imagenet()
# https://www.openml.org/d/46347
URL for dataset: https://www.openml.org/d/46347
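The same round-trip check works for the subset; every per-class count should be exactly 20 (dataset ID 46347 as printed above):

import openml

dataset = openml.datasets.get_dataset(46347)
df, _, _, _ = dataset.get_data()
print(df.groupby("label").size().describe())  # min == max == 20 if the subset is correct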
Create task on OpenML¶
- To actually use the OpenML PyTorch API, we need a task associated with the dataset. This is how we create one.
In [27]:
def create_task():
    # Define task parameters
    task_type = openml.tasks.TaskType.SUPERVISED_CLASSIFICATION
    dataset_id = 46347  # Obtained from the dataset creation step
    evaluation_measure = "predictive_accuracy"
    target_name = "label"
    # Recover the class labels from the published dataset. (The original notebook
    # read them from a locally saved CSV that is not created anywhere above.)
    dataset = openml.datasets.get_dataset(dataset_id)
    df, _, _, _ = dataset.get_data()
    class_labels = list(df["label"].unique())
    cost_matrix = None

    # Create the task; estimation procedure 1 is 10-fold cross-validation
    new_task = openml.tasks.create_task(
        task_type=task_type,
        dataset_id=dataset_id,
        estimation_procedure_id=1,
        evaluation_measure=evaluation_measure,
        target_name=target_name,
        class_labels=class_labels,
        cost_matrix=cost_matrix,
    )
    openml.config.apikey = ''  # fill in your OpenML API key before publishing
    new_task.publish()
    print(f"URL for task: {new_task.openml_url}")
In [28]:
create_task()
# https://www.openml.org/t/362128
URL for task: https://www.openml.org/t/362128
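With the task published, downstream code (e.g. the OpenML PyTorch extension) can retrieve it by ID. A brief sketch of fetching the task and the first cross-validation split of its estimation procedure:

import openml

task = openml.tasks.get_task(362128)
print(task.get_dataset().name)  # tiniest-imagenet-200

# Indices for repeat 0, fold 0 of the task's estimation procedure
train_idx, test_idx = task.get_train_test_split_indices(repeat=0, fold=0)
print(len(train_idx), len(test_idx))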