Skip to content

functions

__list_tasks(api_call, output_format='dict')

Returns a dictionary or a Pandas DataFrame with information about OpenML tasks.

Parameters:

Name Type Description Default
api_call str

The API call specifying which tasks to return.

required
output_format str in {'dict', 'dataframe'}

Output format for the returned object.

'dict'

Returns:

Type Description
Union[Dict, DataFrame]

A dictionary or a Pandas DataFrame with information about OpenML tasks.

Raises:

Type Description
ValueError

If the XML returned by the OpenML API does not contain 'oml:tasks', '@xmlns:oml', or has an incorrect value for '@xmlns:oml'.

KeyError

If an invalid key is found in the XML for a task.

Source code in openml/tasks/functions.py
def __list_tasks(  # noqa: PLR0912, C901
    api_call: str,
    output_format: Literal["dict", "dataframe"] = "dict",
) -> dict | pd.DataFrame:
    """Returns a dictionary or a Pandas DataFrame with information about OpenML tasks.

    Tasks whose task type id cannot be converted to a ``TaskType`` and tasks whose
    XML lacks an expected key are skipped with a ``RuntimeWarning`` instead of
    aborting the whole listing.

    Parameters
    ----------
    api_call : str
        The API call specifying which tasks to return.
    output_format : str in {"dict", "dataframe"}
        Output format for the returned object.

    Returns
    -------
    Union[Dict, pd.DataFrame]
        A dictionary (keyed by task id) or a Pandas DataFrame (indexed by task id)
        with information about OpenML tasks.

    Raises
    ------
    ValueError
        If the XML returned by the OpenML API does not contain 'oml:tasks', '@xmlns:oml',
        or has an incorrect value for '@xmlns:oml'.
    KeyError
        If the 'oml:tasks' element does not contain any 'oml:task' entry.
    """
    xml_string = openml._api_calls._perform_api_call(api_call, "get")
    tasks_dict = xmltodict.parse(xml_string, force_list=("oml:task", "oml:input"))
    # Minimalistic check if the XML is useful
    if "oml:tasks" not in tasks_dict:
        raise ValueError(f'Error in return XML, does not contain "oml:tasks": {tasks_dict}')

    if "@xmlns:oml" not in tasks_dict["oml:tasks"]:
        raise ValueError(
            f'Error in return XML, does not contain "oml:tasks"/@xmlns:oml: {tasks_dict}'
        )

    if tasks_dict["oml:tasks"]["@xmlns:oml"] != "http://openml.org/openml":
        raise ValueError(
            "Error in return XML, value of  "
            '"oml:tasks"/@xmlns:oml is not '
            f'"http://openml.org/openml": {tasks_dict}',
        )

    task_entries = tasks_dict["oml:tasks"]["oml:task"]
    assert isinstance(task_entries, list), type(task_entries)

    tasks = {}
    procs = _get_estimation_procedure_list()
    proc_dict = {x["id"]: x for x in procs}

    for task_ in task_entries:
        # Stays None until the id has been parsed, so the KeyError handler below
        # can tell whether the task can be identified in the warning.
        tid = None
        try:
            tid = int(task_["oml:task_id"])
            task_type_int = int(task_["oml:task_type_id"])
            try:
                task_type_id = TaskType(task_type_int)
            except ValueError as e:
                warnings.warn(
                    f"Could not create task type id for {task_type_int} due to error {e}",
                    RuntimeWarning,
                    stacklevel=2,
                )
                continue

            task = {
                "tid": tid,
                "ttid": task_type_id,
                "did": int(task_["oml:did"]),
                "name": task_["oml:name"],
                "task_type": task_["oml:task_type"],
                "status": task_["oml:status"],
            }

            # Other task inputs
            for _input in task_.get("oml:input", []):
                if _input["@name"] == "estimation_procedure":
                    # Replace the estimation procedure id by its name
                    task[_input["@name"]] = proc_dict[int(_input["#text"])]["name"]
                else:
                    task[_input["@name"]] = _input.get("#text")

            # The number of qualities can range from 0 to infinity
            qualities = task_.get("oml:quality", [])
            if isinstance(qualities, dict):
                # "oml:quality" is not part of `force_list`, so a task with exactly
                # one quality is parsed as a single dict rather than a list of dicts.
                qualities = [qualities]
            for quality in qualities:
                if "#text" not in quality:
                    quality_value = 0.0
                else:
                    quality_value = float(quality["#text"])
                    # Report (nearly) integral values as int
                    if abs(int(quality_value) - quality_value) < 0.0000001:
                        quality_value = int(quality_value)
                task[quality["@name"]] = quality_value
            tasks[tid] = task
        except KeyError as e:
            if tid is not None:
                warnings.warn(
                    f"Invalid xml for task {tid}: {e}\nFrom {task_}",
                    RuntimeWarning,
                    stacklevel=2,
                )
            else:
                warnings.warn(f"Could not find key {e} in {task_}!", RuntimeWarning, stacklevel=2)

    if output_format == "dataframe":
        return pd.DataFrame.from_dict(tasks, orient="index")

    return tasks

create_task(task_type, dataset_id, estimation_procedure_id, target_name=None, evaluation_measure=None, **kwargs)

Create a task based on different given attributes.

Builds a task object with the function arguments as attributes. The type of the task object built is determined from the task type id. More information on how the arguments (task attributes), relate to the different possible tasks can be found in the individual task objects at the openml.tasks.task module.

Parameters:

Name Type Description Default
task_type TaskType

Id of the task type.

required
dataset_id int

The id of the dataset for the task.

required
target_name str

The name of the feature used as a target. At the moment, only optional for the clustering tasks.

None
estimation_procedure_id int

The id of the estimation procedure.

required
evaluation_measure str

The name of the evaluation measure.

None
kwargs dict

Other task attributes that are not mandatory for task upload.

{}

Returns:

Type Description
OpenMLClassificationTask | OpenMLRegressionTask | OpenMLLearningCurveTask | OpenMLClusteringTask

The task object built for the given task type.
Source code in openml/tasks/functions.py
def create_task(
    task_type: TaskType,
    dataset_id: int,
    estimation_procedure_id: int,
    target_name: str | None = None,
    evaluation_measure: str | None = None,
    **kwargs: Any,
) -> (
    OpenMLClassificationTask | OpenMLRegressionTask | OpenMLLearningCurveTask | OpenMLClusteringTask
):
    """Create a task based on different given attributes.

    Builds a task object with the function arguments as
    attributes. The type of the task object built is
    determined from the task type id.
    More information on how the arguments (task attributes),
    relate to the different possible tasks can be found in
    the individual task objects at the openml.tasks.task
    module.

    Parameters
    ----------
    task_type : TaskType
        Id of the task type.
    dataset_id : int
        The id of the dataset for the task.
    estimation_procedure_id : int
        The id of the estimation procedure.
    target_name : str, optional
        The name of the feature used as a target.
        At the moment, only optional for the clustering tasks.
    evaluation_measure : str, optional
        The name of the evaluation measure.
    kwargs : dict, optional
        Other task attributes that are not mandatory
        for task upload.

    Returns
    -------
    OpenMLClassificationTask, OpenMLRegressionTask,
    OpenMLLearningCurveTask, OpenMLClusteringTask

    Raises
    ------
    NotImplementedError
        If ``task_type`` is not one of the clustering, learning curve,
        supervised classification or supervised regression task types.
    """
    if task_type == TaskType.CLUSTERING:
        task_cls = OpenMLClusteringTask
    elif task_type == TaskType.LEARNING_CURVE:
        task_cls = OpenMLLearningCurveTask  # type: ignore
    elif task_type == TaskType.SUPERVISED_CLASSIFICATION:
        task_cls = OpenMLClassificationTask  # type: ignore
    elif task_type == TaskType.SUPERVISED_REGRESSION:
        task_cls = OpenMLRegressionTask  # type: ignore
    else:
        # Report the underlying value when given an enum member. An integer format
        # spec (":d") fails for plain Enum members and other non-int objects, which
        # would hide the NotImplementedError behind a formatting error.
        task_type_repr = getattr(task_type, "value", task_type)
        raise NotImplementedError(f"Task type {task_type_repr} not supported.")

    return task_cls(
        task_type_id=task_type,
        task_type="None",  # TODO: refactor to get task type string from ID.
        data_set_id=dataset_id,
        target_name=target_name,
        estimation_procedure_id=estimation_procedure_id,
        evaluation_measure=evaluation_measure,
        **kwargs,
    )

delete_task(task_id)

Delete task with id task_id from the OpenML server.

You can only delete tasks which you created and have no runs associated with them.

Parameters:

Name Type Description Default
task_id int

OpenML id of the task

required

Returns:

Type Description
bool

True if the deletion was successful. False otherwise.

Source code in openml/tasks/functions.py
def delete_task(task_id: int) -> bool:
    """Remove the task identified by `task_id` from the OpenML server.

    Deletion is only possible for tasks that you created yourself
    and that have no runs associated with them.

    Parameters
    ----------
    task_id : int
        OpenML id of the task

    Returns
    -------
    bool
        True if the deletion was successful. False otherwise.
    """
    # The generic entity deletion helper does the actual work; "task" selects the entity type.
    was_deleted = openml.utils._delete_entity("task", task_id)
    return was_deleted

get_task(task_id, *dataset_args, download_splits=None, **get_dataset_kwargs)

Download OpenML task for a given task ID.

Downloads the task representation. By default, this will also download the data splits and the dataset. From version 0.15.0 onwards, neither the splits nor the dataset will be downloaded by default.

Use the download_splits parameter to control whether the splits are downloaded. Moreover, you may pass additional parameters (args or kwargs) that are passed to :meth:openml.datasets.get_dataset. For backwards compatibility, if download_data is passed as an additional parameter and download_splits is not explicitly set, download_data also overrules download_splits's value (deprecated from Version 0.15.0 onwards).

Parameters:

Name Type Description Default
task_id int

The OpenML task id of the task to download.

required
download_splits bool | None

Whether to download the splits as well. From version 0.15.0 onwards this is independent of download_data and will default to False.

None
dataset_args Any

Args and kwargs can be used to pass optional parameters to :meth:openml.datasets.get_dataset. This includes download_data. If set to True the splits are downloaded as well (deprecated from Version 0.15.0 onwards). The args are only present for backwards compatibility and will be removed from version 0.15.0 onwards.

()
get_dataset_kwargs Any

Args and kwargs can be used to pass optional parameters to :meth:openml.datasets.get_dataset. This includes download_data. If set to True the splits are downloaded as well (deprecated from Version 0.15.0 onwards). The args are only present for backwards compatibility and will be removed from version 0.15.0 onwards.

{}

Returns:

Name Type Description
task OpenMLTask
Source code in openml/tasks/functions.py
@openml.utils.thread_safe_if_oslo_installed
def get_task(
    task_id: int,
    *dataset_args: Any,
    download_splits: bool | None = None,
    **get_dataset_kwargs: Any,
) -> OpenMLTask:
    """Download OpenML task for a given task ID.

    Downloads the task representation. By default, this will also download the data splits and
    the dataset. From version 0.15.0 onwards, neither the splits nor the dataset will be
    downloaded by default.

    Use the `download_splits` parameter to control whether the splits are downloaded.
    Moreover, you may pass additional parameters (args or kwargs) that are passed to
    :meth:`openml.datasets.get_dataset`.
    For backwards compatibility, if `download_data` is passed as an additional keyword argument
    and `download_splits` is not explicitly set, `download_data` also overrules
    `download_splits`'s value (deprecated from Version 0.15.0 onwards).

    Parameters
    ----------
    task_id : int
        The OpenML task id of the task to download.
    download_splits: bool, optional (default=None)
        Whether to download the splits as well. If ``None``, a ``FutureWarning`` is issued and
        the value of the `download_data` keyword argument is used, falling back to ``True``
        if it was not passed. From version 0.15.0 onwards this is independent
        of download_data and will default to ``False``.
    dataset_args, get_dataset_kwargs :
        Args and kwargs can be used to pass optional parameters to
        :meth:`openml.datasets.get_dataset`.
        This includes `download_data`. If set to True the splits are downloaded as well
        (deprecated from Version 0.15.0 onwards). The args are only present for backwards
        compatibility and will be removed from version 0.15.0 onwards.

    Returns
    -------
    task: OpenMLTask

    Raises
    ------
    ValueError
        If `task_id` is not an integer and cannot be cast to one.
    """
    if download_splits is None:
        # TODO(0.15): Switch download splits to False by default, adjust typing above, adjust
        #  documentation above, and remove warning.
        warnings.warn(
            "Starting from Version 0.15.0 `download_splits` will default to ``False`` instead "
            "of ``True`` and be independent from `download_data`. To disable this message until "
            "version 0.15 explicitly set `download_splits` to a bool.",
            FutureWarning,
            stacklevel=3,
        )
        # Only a keyword `download_data` is honoured here; positional dataset args are not
        # inspected.
        download_splits = get_dataset_kwargs.get("download_data", True)

    if not isinstance(task_id, int):
        # TODO(0.15): Remove warning
        warnings.warn(
            "Task id must be specified as `int` from 0.14.0 onwards.",
            FutureWarning,
            stacklevel=3,
        )

    try:
        task_id = int(task_id)
    except (ValueError, TypeError) as e:
        raise ValueError("Task ID is neither an Integer nor can be cast to an Integer.") from e

    tid_cache_dir = openml.utils._create_cache_directory_for_id(TASKS_CACHE_DIR_NAME, task_id)

    try:
        task = _get_task_description(task_id)
        dataset = get_dataset(task.dataset_id, *dataset_args, **get_dataset_kwargs)
        # List of class labels available in dataset description
        # Including class labels as part of task meta data handles
        #   the case where data download was initially disabled
        if isinstance(task, (OpenMLClassificationTask, OpenMLLearningCurveTask)):
            task.class_labels = dataset.retrieve_class_labels(task.target_name)
        # Clustering tasks do not have class labels
        # and do not offer download_split
        if download_splits and isinstance(task, OpenMLSupervisedTask):
            task.download_split()
    except Exception:
        # Clean up the cache directory created above, then re-raise the original
        # exception with its traceback untouched.
        openml.utils._remove_cache_dir_for_id(TASKS_CACHE_DIR_NAME, tid_cache_dir)
        raise

    return task

get_tasks(task_ids, download_data=True, download_qualities=True)

Download tasks.

This function iterates :meth:openml.tasks.get_task.

Parameters:

Name Type Description Default
task_ids List[int]

A list of task ids to download.

required
download_data bool

Option to trigger download of data along with the meta data.

True
download_qualities bool

Option to download 'qualities' meta-data in addition to the minimal dataset description.

True

Returns:

Type Description
list
Source code in openml/tasks/functions.py
def get_tasks(
    task_ids: list[int],
    download_data: bool = True,  # noqa: FBT001, FBT002
    download_qualities: bool = True,  # noqa: FBT001, FBT002
) -> list[OpenMLTask]:
    """Download tasks.

    This function iterates :meth:`openml.tasks.get_task`.

    Parameters
    ----------
    task_ids : List[int]
        A list of task ids to download.
    download_data : bool (default = True)
        Option to trigger download of data along with the meta data.
    download_qualities : bool (default=True)
        Option to download 'qualities' meta-data in addition to the minimal dataset description.

    Returns
    -------
    list
        The downloaded tasks, in the same order as ``task_ids``.
    """
    # Forward the options by keyword rather than by position. `get_task` only reads
    # `download_data` from its keyword arguments when deciding whether to download the
    # splits, and positional values would depend on the parameter order of
    # `openml.datasets.get_dataset`.
    # NOTE(review): assumes `get_dataset` accepts `download_data` and
    # `download_qualities` as keyword arguments - confirm against its signature.
    return [
        get_task(task_id, download_data=download_data, download_qualities=download_qualities)
        for task_id in task_ids
    ]

list_tasks(task_type=None, offset=None, size=None, tag=None, output_format='dict', **kwargs)

Return a number of tasks having the given tag and task_type

Parameters:

Name Type Description Default
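Note: the filter task_type is separated from the other filters because it is used as task_type in the task description, but it is named type when used as a filter in the list tasks call.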
task_type TaskType

Refers to the type of task.

None
offset int

the number of tasks to skip, starting from the first

None
size int

the maximum number of tasks to show

None
tag str

the tag to include

None
output_format Literal['dict', 'dataframe']

Determines the format of the output: if 'dict', the output is a dict of dicts; if 'dataframe', the output is a pandas DataFrame.

'dict'
kwargs Any

Legal filter operators: data_tag, status, data_id, data_name, number_instances, number_features, number_classes, number_missing_values.

{}

Returns:

Type Description
dict

All tasks having the given task_type and the given tag. Every task is represented by a dictionary containing the following information: task id, dataset id, task_type and status. If qualities are calculated for the associated dataset, some of these are also returned.

dataframe

All tasks having the given task_type and the given tag. Every task is represented by a row in the data frame containing the following information as columns: task id, dataset id, task_type and status. If qualities are calculated for the associated dataset, some of these are also returned.

Source code in openml/tasks/functions.py
def list_tasks(
    task_type: TaskType | None = None,
    offset: int | None = None,
    size: int | None = None,
    tag: str | None = None,
    output_format: Literal["dict", "dataframe"] = "dict",
    **kwargs: Any,
) -> dict | pd.DataFrame:
    """
    Return a number of tasks having the given tag and task_type.

    The filter ``task_type`` is kept separate from the other filters because
    it is used as task_type in the task description, but it is named
    type when used as a filter in the list tasks call.

    Parameters
    ----------
    task_type : TaskType, optional
        Refers to the type of task.
    offset : int, optional
        The number of tasks to skip, starting from the first.
    size : int, optional
        The maximum number of tasks to show.
    tag : str, optional
        The tag to include.
    output_format: str, optional (default='dict')
        Determines the format of the output.
        - If 'dict' the output is a dict of dict
        - If 'dataframe' the output is a pandas DataFrame
    kwargs: dict, optional
        Legal filter operators: data_tag, status, data_id, data_name,
        number_instances, number_features,
        number_classes, number_missing_values.

    Returns
    -------
    dict
        All tasks having the given task_type and the given tag. Every task is
        represented by a dictionary containing the following information:
        task id, dataset id, task_type and status. If qualities are calculated
        for the associated dataset, some of these are also returned.
    dataframe
        All tasks having the given task_type and the given tag. Every task is
        represented by a row in the data frame containing the following information
        as columns: task id, dataset id, task_type and status. If qualities are
        calculated for the associated dataset, some of these are also returned.

    Raises
    ------
    ValueError
        If ``output_format`` is neither 'dict' nor 'dataframe'.
    """
    supported_formats = ("dict", "dataframe")
    if output_format not in supported_formats:
        error_msg = "Invalid output format selected. Only 'dict' or 'dataframe' applicable."
        raise ValueError(error_msg)

    # TODO: [0.15]
    if output_format == "dict":
        # Warn here, directly in this function, so that `stacklevel=2` keeps
        # pointing at the caller of `list_tasks`.
        warnings.warn(
            "Support for `output_format` of 'dict' will be removed in 0.15 "
            "and pandas dataframes will be returned instead. To ensure your code "
            "will continue to work, use `output_format`='dataframe'.",
            category=FutureWarning,
            stacklevel=2,
        )

    listing = openml.utils._list_all(  # type: ignore
        list_output_format=output_format,  # type: ignore
        listing_call=_list_tasks,
        task_type=task_type,
        offset=offset,
        size=size,
        tag=tag,
        **kwargs,
    )
    return listing