Skip to content

tasks

OpenMLClassificationTask

Bases: OpenMLSupervisedTask

OpenML Classification object.

Parameters:

Name Type Description Default
task_type_id TaskType

ID of the Classification task type.

required
task_type str

Name of the Classification task type.

required
data_set_id int

ID of the OpenML dataset associated with the Classification task.

required
target_name str

Name of the target variable.

required
estimation_procedure_id int

ID of the estimation procedure for the Classification task.

1
estimation_procedure_type str

Type of the estimation procedure.

None
estimation_parameters dict

Estimation parameters for the Classification task.

None
evaluation_measure str

Name of the evaluation measure.

None
data_splits_url str

URL of the data splits for the Classification task.

None
task_id Union[int, None]

ID of the Classification task (if it already exists on OpenML).

None
class_labels List of str

A list of class labels (for classification tasks).

None
cost_matrix array

A cost matrix (for classification tasks).

None
Source code in openml/tasks/task.py
class OpenMLClassificationTask(OpenMLSupervisedTask):
    """Supervised OpenML task whose target is a class label.

    Every task-level argument is handed to :class:`OpenMLSupervisedTask`;
    this class only stores ``class_labels`` and ``cost_matrix`` on top.

    Parameters
    ----------
    task_type_id : TaskType
        ID of the Classification task type.
    task_type : str
        Name of the Classification task type.
    data_set_id : int
        ID of the OpenML dataset associated with the Classification task.
    target_name : str
        Name of the target variable.
    estimation_procedure_id : int, default=1
        ID of the estimation procedure for the Classification task.
    estimation_procedure_type : str, default=None
        Type of the estimation procedure.
    estimation_parameters : dict, default=None
        Estimation parameters for the Classification task.
    evaluation_measure : str, default=None
        Name of the evaluation measure.
    data_splits_url : str, default=None
        URL of the data splits for the Classification task.
    task_id : int or None, default=None
        ID of the Classification task (if it already exists on OpenML).
    class_labels : list of str, default=None
        A list of class labels (for classification tasks).
    cost_matrix : array, default=None
        A cost matrix (for classification tasks). Only ``None`` is accepted
        at the moment.

    Raises
    ------
    NotImplementedError
        If ``cost_matrix`` is not ``None``.
    """

    def __init__(  # noqa: PLR0913
        self,
        task_type_id: TaskType,
        task_type: str,
        data_set_id: int,
        target_name: str,
        estimation_procedure_id: int = 1,
        estimation_procedure_type: str | None = None,
        estimation_parameters: dict[str, str] | None = None,
        evaluation_measure: str | None = None,
        data_splits_url: str | None = None,
        task_id: int | None = None,
        class_labels: list[str] | None = None,
        cost_matrix: np.ndarray | None = None,
    ):
        # Keyword arguments follow the order of this signature.
        super().__init__(
            task_type_id=task_type_id,
            task_type=task_type,
            data_set_id=data_set_id,
            target_name=target_name,
            estimation_procedure_id=estimation_procedure_id,
            estimation_procedure_type=estimation_procedure_type,
            estimation_parameters=estimation_parameters,
            evaluation_measure=evaluation_measure,
            data_splits_url=data_splits_url,
            task_id=task_id,
        )
        self.class_labels = class_labels
        self.cost_matrix = cost_matrix

        # Cost matrices are rejected after the attributes have been stored.
        if cost_matrix is not None:
            raise NotImplementedError("Costmatrix")

OpenMLClusteringTask

Bases: OpenMLTask

OpenML Clustering object.

Parameters:

Name Type Description Default
task_type_id TaskType

Task type ID of the OpenML clustering task.

required
task_type str

Task type of the OpenML clustering task.

required
data_set_id int

ID of the OpenML dataset used in clustering the task.

required
estimation_procedure_id int

ID of the OpenML estimation procedure.

17
task_id Union[int, None]

ID of the OpenML clustering task.

None
estimation_procedure_type str

Type of the OpenML estimation procedure used in the clustering task.

None
estimation_parameters dict

Parameters used by the OpenML estimation procedure.

None
data_splits_url str

URL of the OpenML data splits for the clustering task.

None
evaluation_measure str

Evaluation measure used in the clustering task.

None
target_name str

Name of the target feature (class) that is not part of the feature set for the clustering task.

None
Source code in openml/tasks/task.py
class OpenMLClusteringTask(OpenMLTask):
    """OpenML task describing a clustering problem.

    Parameters
    ----------
    task_type_id : TaskType
        Task type ID of the OpenML clustering task.
    task_type : str
        Task type of the OpenML clustering task.
    data_set_id : int
        ID of the OpenML dataset used in clustering the task.
    estimation_procedure_id : int, default=17
        ID of the OpenML estimation procedure.
    task_id : int or None, default=None
        ID of the OpenML clustering task.
    estimation_procedure_type : str, default=None
        Type of the OpenML estimation procedure used in the clustering task.
    estimation_parameters : dict, default=None
        Parameters used by the OpenML estimation procedure.
    data_splits_url : str, default=None
        URL of the OpenML data splits for the clustering task.
    evaluation_measure : str, default=None
        Evaluation measure used in the clustering task.
    target_name : str, default=None
        Name of the target feature (class) that is not part of the
        feature set for the clustering task.
    """

    def __init__(  # noqa: PLR0913
        self,
        task_type_id: TaskType,
        task_type: str,
        data_set_id: int,
        estimation_procedure_id: int = 17,
        task_id: int | None = None,
        estimation_procedure_type: str | None = None,
        estimation_parameters: dict[str, str] | None = None,
        data_splits_url: str | None = None,
        evaluation_measure: str | None = None,
        target_name: str | None = None,
    ):
        # ``target_name`` is the only argument kept here; everything else is
        # handled by the parent constructor.
        super().__init__(
            task_type_id=task_type_id,
            task_type=task_type,
            data_set_id=data_set_id,
            estimation_procedure_id=estimation_procedure_id,
            task_id=task_id,
            estimation_procedure_type=estimation_procedure_type,
            estimation_parameters=estimation_parameters,
            data_splits_url=data_splits_url,
            evaluation_measure=evaluation_measure,
        )

        self.target_name = target_name

    @overload
    def get_X(
        self,
        dataset_format: Literal["array"] = "array",
    ) -> np.ndarray | scipy.sparse.spmatrix:
        ...

    @overload
    def get_X(self, dataset_format: Literal["dataframe"]) -> pd.DataFrame:
        ...

    def get_X(
        self,
        dataset_format: Literal["array", "dataframe"] = "array",
    ) -> np.ndarray | pd.DataFrame | scipy.sparse.spmatrix:
        """Get the data associated with the current task.

        Parameters
        ----------
        dataset_format : str
            Data structure of the returned data. See :meth:`openml.datasets.OpenMLDataset.get_data`
            for possible options.

        Returns
        -------
        numpy.ndarray, pandas.DataFrame or scipy.sparse.spmatrix
            The first element returned by the dataset's ``get_data`` when it
            is called with ``target=None``; no separate ``y`` is returned.
        """
        # Only the first item of the ``get_data`` result is of interest here.
        features, *_rest = self.get_dataset().get_data(
            dataset_format=dataset_format,
            target=None,
        )
        return features

    def _to_dict(self) -> dict[str, dict[str, int | str | list[dict[str, Any]]]]:
        """Return the dictionary representation built by the parent class.

        ``target_name`` is currently not added to the result.
        """
        # Sending the target feature is not supported as a feature right now
        # (https://github.com/openml/OpenML/issues/925). If the server starts
        # supporting it, the following can be enabled:
        #
        # task_dict = task_container['oml:task_inputs']
        # if self.target_name is not None:
        #     task_dict['oml:input'].append(
        #         OrderedDict([
        #             ('@name', 'target_feature'),
        #             ('#text', self.target_name)
        #         ])
        #     )
        return super()._to_dict()

get_X(dataset_format='array')

get_X(dataset_format: Literal['array'] = 'array') -> np.ndarray | scipy.sparse.spmatrix
get_X(dataset_format: Literal['dataframe']) -> pd.DataFrame

Get data associated with the current task.

Parameters:

Name Type Description Default
dataset_format str

Data structure of the returned data. See :meth:openml.datasets.OpenMLDataset.get_data for possible options.

'array'

Returns:

Type Description
ndarray | DataFrame | spmatrix - X only (the data is requested with target=None, so no y is returned)
Source code in openml/tasks/task.py
def get_X(
    self,
    dataset_format: Literal["array", "dataframe"] = "array",
) -> np.ndarray | pd.DataFrame | scipy.sparse.spmatrix:
    """Get the data associated with the current task.

    Parameters
    ----------
    dataset_format : str
        Data structure of the returned data. See :meth:`openml.datasets.OpenMLDataset.get_data`
        for possible options.

    Returns
    -------
    numpy.ndarray, pandas.DataFrame or scipy.sparse.spmatrix
        The first element returned by the dataset's ``get_data`` when it is
        called with ``target=None``; no separate ``y`` is returned.
    """
    # Only the first item of the ``get_data`` result is of interest here.
    features, *_rest = self.get_dataset().get_data(
        dataset_format=dataset_format,
        target=None,
    )
    return features

OpenMLLearningCurveTask

Bases: OpenMLClassificationTask

OpenML Learning Curve object.

Parameters:

Name Type Description Default
task_type_id TaskType

ID of the Learning Curve task.

required
task_type str

Name of the Learning Curve task.

required
data_set_id int

ID of the dataset that this task is associated with.

required
target_name str

Name of the target feature in the dataset.

required
estimation_procedure_id int

ID of the estimation procedure to use for evaluating models.

13
estimation_procedure_type str

Type of the estimation procedure.

None
estimation_parameters dict

Additional parameters for the estimation procedure.

None
data_splits_url str

URL of the file containing the data splits for Learning Curve task.

None
task_id Union[int, None]

ID of the Learning Curve task.

None
evaluation_measure str

Name of the evaluation measure to use for evaluating models.

None
class_labels list of str

Class labels for Learning Curve tasks.

None
cost_matrix numpy array

Cost matrix for Learning Curve tasks.

None
Source code in openml/tasks/task.py
class OpenMLLearningCurveTask(OpenMLClassificationTask):
    """OpenML Learning Curve task.

    The constructor adds no state of its own: every argument is forwarded to
    :class:`OpenMLClassificationTask`.

    Parameters
    ----------
    task_type_id : TaskType
        ID of the Learning Curve task.
    task_type : str
        Name of the Learning Curve task.
    data_set_id : int
        ID of the dataset that this task is associated with.
    target_name : str
        Name of the target feature in the dataset.
    estimation_procedure_id : int, default=13
        ID of the estimation procedure to use for evaluating models.
    estimation_procedure_type : str, default=None
        Type of the estimation procedure.
    estimation_parameters : dict, default=None
        Additional parameters for the estimation procedure.
    data_splits_url : str, default=None
        URL of the file containing the data splits for Learning Curve task.
    task_id : int or None, default=None
        ID of the Learning Curve task.
    evaluation_measure : str, default=None
        Name of the evaluation measure to use for evaluating models.
    class_labels : list of str, default=None
        Class labels for Learning Curve tasks.
    cost_matrix : numpy array, default=None
        Cost matrix for Learning Curve tasks.
    """

    def __init__(  # noqa: PLR0913
        self,
        task_type_id: TaskType,
        task_type: str,
        data_set_id: int,
        target_name: str,
        estimation_procedure_id: int = 13,
        estimation_procedure_type: str | None = None,
        estimation_parameters: dict[str, str] | None = None,
        data_splits_url: str | None = None,
        task_id: int | None = None,
        evaluation_measure: str | None = None,
        class_labels: list[str] | None = None,
        cost_matrix: np.ndarray | None = None,
    ):
        # Keyword arguments follow the order of this signature.
        super().__init__(
            task_type_id=task_type_id,
            task_type=task_type,
            data_set_id=data_set_id,
            target_name=target_name,
            estimation_procedure_id=estimation_procedure_id,
            estimation_procedure_type=estimation_procedure_type,
            estimation_parameters=estimation_parameters,
            data_splits_url=data_splits_url,
            task_id=task_id,
            evaluation_measure=evaluation_measure,
            class_labels=class_labels,
            cost_matrix=cost_matrix,
        )

OpenMLRegressionTask

Bases: OpenMLSupervisedTask

OpenML Regression object.

Parameters:

Name Type Description Default
task_type_id TaskType

Task type ID of the OpenML Regression task.

required
task_type str

Task type of the OpenML Regression task.

required
data_set_id int

ID of the OpenML dataset.

required
target_name str

Name of the target feature used in the Regression task.

required
estimation_procedure_id int

ID of the OpenML estimation procedure.

7
estimation_procedure_type str

Type of the OpenML estimation procedure.

None
estimation_parameters dict

Parameters used by the OpenML estimation procedure.

None
data_splits_url str

URL of the OpenML data splits for the Regression task.

None
task_id Union[int, None]

ID of the OpenML Regression task.

None
evaluation_measure str

Evaluation measure used in the Regression task.

None
Source code in openml/tasks/task.py
class OpenMLRegressionTask(OpenMLSupervisedTask):
    """OpenML Regression task.

    The constructor adds no state of its own: every argument is forwarded to
    :class:`OpenMLSupervisedTask`.

    Parameters
    ----------
    task_type_id : TaskType
        Task type ID of the OpenML Regression task.
    task_type : str
        Task type of the OpenML Regression task.
    data_set_id : int
        ID of the OpenML dataset.
    target_name : str
        Name of the target feature used in the Regression task.
    estimation_procedure_id : int, default=7
        ID of the OpenML estimation procedure.
    estimation_procedure_type : str, default=None
        Type of the OpenML estimation procedure.
    estimation_parameters : dict, default=None
        Parameters used by the OpenML estimation procedure.
    data_splits_url : str, default=None
        URL of the OpenML data splits for the Regression task.
    task_id : int or None, default=None
        ID of the OpenML Regression task.
    evaluation_measure : str, default=None
        Evaluation measure used in the Regression task.
    """

    def __init__(  # noqa: PLR0913
        self,
        task_type_id: TaskType,
        task_type: str,
        data_set_id: int,
        target_name: str,
        estimation_procedure_id: int = 7,
        estimation_procedure_type: str | None = None,
        estimation_parameters: dict[str, str] | None = None,
        data_splits_url: str | None = None,
        task_id: int | None = None,
        evaluation_measure: str | None = None,
    ):
        # Keyword arguments follow the order of this signature.
        super().__init__(
            task_type_id=task_type_id,
            task_type=task_type,
            data_set_id=data_set_id,
            target_name=target_name,
            estimation_procedure_id=estimation_procedure_id,
            estimation_procedure_type=estimation_procedure_type,
            estimation_parameters=estimation_parameters,
            data_splits_url=data_splits_url,
            task_id=task_id,
            evaluation_measure=evaluation_measure,
        )

OpenMLSplit

OpenML Split object.

Parameters:

Name Type Description Default
name int or str
required
description str
required
split dict
required
Source code in openml/tasks/split.py
class OpenMLSplit:
    """OpenML Split object.

    Stores train/test row indices as a nested mapping
    ``split[repetition][fold][sample] -> (train_indices, test_indices)``.

    Parameters
    ----------
    name : int or str
    description : str
    split : dict
        Nested mapping ``repetition -> fold -> sample -> (train, test)``.
        Repetition keys are converted with ``int``; fold and sample keys are
        stored unchanged. A repetition ``0`` containing a fold ``0`` must be
        present, because ``folds`` and ``samples`` are read from it.

    Attributes
    ----------
    repeats : int
        Number of repetitions.
    folds : int
        Number of folds of repetition ``0``.
    samples : int
        Number of samples of fold ``0`` in repetition ``0``.

    Raises
    ------
    ValueError
        If a repetition does not have the same number of folds as
        repetition ``0``.
    """

    def __init__(
        self,
        name: int | str,
        description: str,
        split: dict[int, dict[int, dict[int, tuple[np.ndarray, np.ndarray]]]],
    ):
        self.description = description
        self.name = name
        self.split: dict[int, dict[int, dict[int, tuple[np.ndarray, np.ndarray]]]] = {}

        # Copy the nested structure. The input is read through its original
        # keys, so repetition keys that are not ints yet (e.g. "0") work too.
        for repetition, folds in split.items():
            self.split[int(repetition)] = OrderedDict(
                (fold, OrderedDict(samples)) for fold, samples in folds.items()
            )

        self.repeats = len(self.split)

        # Repetition 0 is the reference for the number of folds and samples.
        expected_folds = len(self.split[0])
        for repetition, folds in self.split.items():
            if len(folds) != expected_folds:
                raise ValueError(
                    f"Repetition {repetition} has {len(folds)} fold(s), but repetition 0 "
                    f"has {expected_folds}; all repetitions must have the same number of folds.",
                )

        self.folds = expected_folds
        self.samples = len(self.split[0][0])

    def __eq__(self, other: Any) -> bool:
        """Return True if ``other`` holds the same name, description and splits.

        Any difference in the repetition, fold or sample keys, or in the
        contents or shape of a train/test array, makes the objects unequal.
        """
        # The first test keeps arbitrary objects from reaching the attribute
        # accesses below; the second one is the original type relation.
        if not isinstance(other, OpenMLSplit) or not isinstance(self, type(other)):
            return False
        if (
            self.name != other.name
            or self.description != other.description
            or self.split.keys() != other.split.keys()
        ):
            return False

        for repetition, folds in self.split.items():
            other_folds = other.split[repetition]
            if folds.keys() != other_folds.keys():
                return False
            for fold, samples in folds.items():
                other_samples = other_folds[fold]
                # Sample keys have to match as well, otherwise the lookup
                # below would raise KeyError.
                if samples.keys() != other_samples.keys():
                    return False
                for sample, (self_train, self_test) in samples.items():
                    other_train, other_test = other_samples[sample]
                    # ``array_equal`` also compares shapes, so an empty array
                    # is never considered equal to a non-empty one.
                    if not (
                        np.array_equal(self_train, other_train)
                        and np.array_equal(self_test, other_test)
                    ):
                        return False
        return True

    @classmethod
    def _from_arff_file(cls, filename: Path) -> OpenMLSplit:  # noqa: C901, PLR0912
        """Build a split from an ARFF file, using a pickle cache next to it.

        The cache is ``filename`` with its suffix replaced by ``.pkl.py3``.
        If it exists it is loaded; otherwise the ARFF file is parsed and the
        cache is written.

        Parameters
        ----------
        filename : Path
            ARFF file with the attributes ``type``, ``rowid``, ``repeat``,
            ``fold`` and optionally ``sample``.

        Returns
        -------
        OpenMLSplit
            Split named after the ARFF relation, with an empty description.

        Raises
        ------
        FileNotFoundError
            If neither the cache nor the ARFF file exists.
        ValueError
            If a row's type is neither ``"TRAIN"`` nor ``"TEST"``.
        """
        repetitions = None
        name = None

        pkl_filename = filename.with_suffix(".pkl.py3")

        if pkl_filename.exists():
            with pkl_filename.open("rb") as fh:
                # NOTE(security): unpickling can execute arbitrary code, so
                # the cache file has to come from a trusted location.
                # TODO(eddiebergman): Would be good to figure out what _split is and assert it is
                _split = pickle.load(fh)  # noqa: S301
            repetitions = _split["repetitions"]
            name = _split["name"]

        # Cache miss
        if repetitions is None:
            if not filename.exists():
                raise FileNotFoundError(f"Split arff {filename} does not exist!")

            repetitions = OrderedDict()

            # The rows are consumed inside the ``with`` block: DENSE_GEN
            # presumably yields them lazily from the open file, and the
            # handle is closed once they have all been read.
            with filename.open("r") as arff_fh:
                file_data = arff.load(arff_fh, return_type=arff.DENSE_GEN)
                splits = file_data["data"]
                name = file_data["relation"]
                attrnames = [attr[0] for attr in file_data["attributes"]]

                type_idx = attrnames.index("type")
                rowid_idx = attrnames.index("rowid")
                repeat_idx = attrnames.index("repeat")
                fold_idx = attrnames.index("fold")
                sample_idx = attrnames.index("sample") if "sample" in attrnames else None

                for line in splits:
                    # A line looks like type, rowid, repeat, fold
                    repetition = int(line[repeat_idx])
                    fold = int(line[fold_idx])
                    sample = 0
                    if sample_idx is not None:
                        sample = int(line[sample_idx])

                    if repetition not in repetitions:
                        repetitions[repetition] = OrderedDict()
                    if fold not in repetitions[repetition]:
                        repetitions[repetition][fold] = OrderedDict()
                    if sample not in repetitions[repetition][fold]:
                        repetitions[repetition][fold][sample] = ([], [])
                    split = repetitions[repetition][fold][sample]

                    type_ = line[type_idx]
                    if type_ == "TRAIN":
                        split[0].append(line[rowid_idx])
                    elif type_ == "TEST":
                        split[1].append(line[rowid_idx])
                    else:
                        raise ValueError(type_)

            # Replace the collected row-id lists by int32 arrays.
            for folds in repetitions.values():
                for samples in folds.values():
                    for sample, (train_rows, test_rows) in samples.items():
                        samples[sample] = Split(
                            np.array(train_rows, dtype=np.int32),
                            np.array(test_rows, dtype=np.int32),
                        )

            with pkl_filename.open("wb") as fh:
                pickle.dump({"name": name, "repetitions": repetitions}, fh, protocol=2)

        assert name is not None
        return cls(name, "", repetitions)

    def get(self, repeat: int = 0, fold: int = 0, sample: int = 0) -> tuple[np.ndarray, np.ndarray]:
        """Return the data split stored for a repeat, fold and sample.

        Parameters
        ----------
        repeat : int
            Index of the repeat to retrieve.
        fold : int
            Index of the fold to retrieve.
        sample : int
            Index of the sample to retrieve.

        Returns
        -------
        tuple of (numpy.ndarray, numpy.ndarray)
            The ``(train, test)`` entry stored for the specified repeat,
            fold, and sample.

        Raises
        ------
        ValueError
            If the specified repeat, fold, or sample is not known.
        """
        if repeat not in self.split:
            raise ValueError(f"Repeat {repeat} not known")
        if fold not in self.split[repeat]:
            raise ValueError(f"Fold {fold} not known")
        if sample not in self.split[repeat][fold]:
            raise ValueError(f"Sample {sample} not known")
        return self.split[repeat][fold][sample]

get(repeat=0, fold=0, sample=0)

Returns the specified data split from the OpenMLSplit object.

Parameters:

Name Type Description Default
repeat int

Index of the repeat to retrieve.

0
fold int

Index of the fold to retrieve.

0
sample int

Index of the sample to retrieve.

0

Returns:

Type Description
tuple[ndarray, ndarray]

The (train, test) data split for the specified repeat, fold, and sample.

Raises:

Type Description
ValueError

If the specified repeat, fold, or sample is not known.

Source code in openml/tasks/split.py
def get(self, repeat: int = 0, fold: int = 0, sample: int = 0) -> tuple[np.ndarray, np.ndarray]:
    """Look up the data split stored for a repeat, fold and sample.

    Parameters
    ----------
    repeat : int
        Index of the repeat to retrieve.
    fold : int
        Index of the fold to retrieve.
    sample : int
        Index of the sample to retrieve.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        The entry stored under ``self.split[repeat][fold][sample]``.

    Raises
    ------
    ValueError
        If the specified repeat, fold, or sample is not known.
    """
    # Walk down the nested mapping one level at a time; the label names the
    # level in the error message when a key is missing.
    level = self.split
    for label, key in (("Repeat", repeat), ("Fold", fold), ("Sample", sample)):
        if key not in level:
            raise ValueError(f"{label} {key!s} not known")
        level = level[key]
    return level

OpenMLSupervisedTask

Bases: OpenMLTask, ABC

OpenML Supervised Classification object.

Parameters:

Name Type Description Default
task_type_id TaskType

ID of the task type.

required
task_type str

Name of the task type.

required
data_set_id int

ID of the OpenML dataset associated with the task.

required
target_name str

Name of the target feature (the class variable).

required
estimation_procedure_id int

ID of the estimation procedure for the task.

1
estimation_procedure_type str

Type of the estimation procedure for the task.

None
estimation_parameters dict

Estimation parameters for the task.

None
evaluation_measure str

Name of the evaluation measure for the task.

None
data_splits_url str

URL of the data splits for the task.

None
task_id int | None

Refers to the unique identifier of task.

None
Source code in openml/tasks/task.py
class OpenMLSupervisedTask(OpenMLTask, ABC):
    """Abstract base class for OpenML supervised tasks.

    Adds a target feature to :class:`OpenMLTask` together with access to the
    task's data as ``X`` and ``y``.

    Parameters
    ----------
    task_type_id : TaskType
        ID of the task type.
    task_type : str
        Name of the task type.
    data_set_id : int
        ID of the OpenML dataset associated with the task.
    target_name : str
        Name of the target feature (the class variable).
    estimation_procedure_id : int, default=1
        ID of the estimation procedure for the task.
    estimation_procedure_type : str, default=None
        Type of the estimation procedure for the task.
    estimation_parameters : dict, default=None
        Estimation parameters for the task.
    evaluation_measure : str, default=None
        Name of the evaluation measure for the task.
    data_splits_url : str, default=None
        URL of the data splits for the task.
    task_id : int or None, default=None
        Refers to the unique identifier of task.
    """

    def __init__(  # noqa: PLR0913
        self,
        task_type_id: TaskType,
        task_type: str,
        data_set_id: int,
        target_name: str,
        estimation_procedure_id: int = 1,
        estimation_procedure_type: str | None = None,
        estimation_parameters: dict[str, str] | None = None,
        evaluation_measure: str | None = None,
        data_splits_url: str | None = None,
        task_id: int | None = None,
    ):
        super().__init__(
            task_id=task_id,
            task_type_id=task_type_id,
            task_type=task_type,
            data_set_id=data_set_id,
            estimation_procedure_id=estimation_procedure_id,
            estimation_procedure_type=estimation_procedure_type,
            estimation_parameters=estimation_parameters,
            evaluation_measure=evaluation_measure,
            data_splits_url=data_splits_url,
        )

        self.target_name = target_name

    @overload
    def get_X_and_y(
        self, dataset_format: Literal["array"] = "array"
    ) -> tuple[
        np.ndarray | scipy.sparse.spmatrix,
        np.ndarray | None,
    ]:
        ...

    @overload
    def get_X_and_y(
        self, dataset_format: Literal["dataframe"]
    ) -> tuple[
        pd.DataFrame,
        pd.Series | pd.DataFrame | None,
    ]:
        ...

    # TODO(eddiebergman): Do all OpenMLSupervisedTask have a `y`?
    def get_X_and_y(
        self, dataset_format: Literal["dataframe", "array"] = "array"
    ) -> tuple[
        np.ndarray | pd.DataFrame | scipy.sparse.spmatrix,
        np.ndarray | pd.Series | pd.DataFrame | None,
    ]:
        """Get data associated with the current task.

        Parameters
        ----------
        dataset_format : str
            Data structure of the returned data. See :meth:`openml.datasets.OpenMLDataset.get_data`
            for possible options.

        Returns
        -------
        tuple - X and y

        Raises
        ------
        NotImplementedError
            If the task type is not supervised classification, supervised
            regression or learning curve.

        Warns
        -----
        FutureWarning
            If ``dataset_format`` is ``"array"``.
        """
        # TODO: [0.15]
        if dataset_format == "array":
            warnings.warn(
                "Support for `dataset_format='array'` will be removed in 0.15, "
                "start using `dataset_format='dataframe'` to ensure your code "
                "will continue to work. You can use the dataframe's `to_numpy` "
                "function to continue using numpy arrays.",
                category=FutureWarning,
                stacklevel=2,
            )
        # Reject unsupported task types before the dataset is fetched, so a
        # call that cannot succeed does not trigger the dataset lookup.
        if self.task_type_id not in (
            TaskType.SUPERVISED_CLASSIFICATION,
            TaskType.SUPERVISED_REGRESSION,
            TaskType.LEARNING_CURVE,
        ):
            raise NotImplementedError(self.task_type)

        dataset = self.get_dataset()
        X, y, _, _ = dataset.get_data(
            dataset_format=dataset_format,
            target=self.target_name,
        )
        return X, y

    def _to_dict(self) -> dict[str, dict]:
        """Return the parent's dictionary with the target feature appended.

        The target is added to ``["oml:task_inputs"]["oml:input"]`` as an
        input named ``target_feature``.
        """
        task_container = super()._to_dict()
        oml_input = task_container["oml:task_inputs"]["oml:input"]  # type: ignore
        assert isinstance(oml_input, list)

        oml_input.append({"@name": "target_feature", "#text": self.target_name})
        return task_container

    @property
    def estimation_parameters(self) -> dict[str, str] | None:
        """Return the estimation parameters for the task.

        Reading this attribute emits a ``PendingDeprecationWarning``; the
        value is ``estimation_procedure["parameters"]``.
        """
        warnings.warn(
            "The estimation_parameters attribute will be "
            "deprecated in the future, please use "
            "estimation_procedure['parameters'] instead",
            PendingDeprecationWarning,
            stacklevel=2,
        )
        return self.estimation_procedure["parameters"]

    @estimation_parameters.setter
    def estimation_parameters(self, est_parameters: dict[str, str] | None) -> None:
        # Stored inside ``estimation_procedure``; setting does not warn.
        self.estimation_procedure["parameters"] = est_parameters

estimation_parameters: dict[str, str] | None property writable

Return the estimation parameters for the task.

get_X_and_y(dataset_format='array')

get_X_and_y(dataset_format: Literal['array'] = 'array') -> tuple[np.ndarray | scipy.sparse.spmatrix, np.ndarray | None]
get_X_and_y(dataset_format: Literal['dataframe']) -> tuple[pd.DataFrame, pd.Series | pd.DataFrame | None]

Get data associated with the current task.

Parameters:

Name Type Description Default
dataset_format str

Data structure of the returned data. See :meth:openml.datasets.OpenMLDataset.get_data for possible options.

'array'

Returns:

Type Description
tuple - X and y
Source code in openml/tasks/task.py
def get_X_and_y(
    self, dataset_format: Literal["dataframe", "array"] = "array"
) -> tuple[
    np.ndarray | pd.DataFrame | scipy.sparse.spmatrix,
    np.ndarray | pd.Series | pd.DataFrame | None,
]:
    """Get data associated with the current task.

    Parameters
    ----------
    dataset_format : str
        Data structure of the returned data. See :meth:`openml.datasets.OpenMLDataset.get_data`
        for possible options.

    Returns
    -------
    tuple - X and y

    Raises
    ------
    NotImplementedError
        If the task type is not supervised classification, supervised
        regression or learning curve.

    Warns
    -----
    FutureWarning
        If ``dataset_format`` is ``"array"``.
    """
    # TODO: [0.15]
    if dataset_format == "array":
        warnings.warn(
            "Support for `dataset_format='array'` will be removed in 0.15, "
            "start using `dataset_format='dataframe'` to ensure your code "
            "will continue to work. You can use the dataframe's `to_numpy` "
            "function to continue using numpy arrays.",
            category=FutureWarning,
            stacklevel=2,
        )
    # Reject unsupported task types before the dataset is fetched, so a call
    # that cannot succeed does not trigger the dataset lookup.
    if self.task_type_id not in (
        TaskType.SUPERVISED_CLASSIFICATION,
        TaskType.SUPERVISED_REGRESSION,
        TaskType.LEARNING_CURVE,
    ):
        raise NotImplementedError(self.task_type)

    dataset = self.get_dataset()
    X, y, _, _ = dataset.get_data(
        dataset_format=dataset_format,
        target=self.target_name,
    )
    return X, y

OpenMLTask

Bases: OpenMLBase

OpenML Task object.

Parameters:

Name Type Description Default
task_id int | None

Refers to the unique identifier of OpenML task.

required
task_type_id TaskType

Refers to the type of OpenML task.

required
task_type str

Refers to the OpenML task.

required
data_set_id int

Refers to the data.

required
estimation_procedure_id int

Refers to the type of estimates used.

1
estimation_procedure_type str | None

Refers to the type of estimation procedure used for the OpenML task.

None
estimation_parameters dict[str, str] | None

Estimation parameters used for the OpenML task.

None
evaluation_measure str | None

Refers to the evaluation measure.

None
data_splits_url str | None

Refers to the URL of the data splits used for the OpenML task.

None
Source code in openml/tasks/task.py
class OpenMLTask(OpenMLBase):
    """OpenML Task object.

    Stores the meta-data of a task, downloads and caches its data splits on
    demand, and serialises itself to the dictionary layout used for upload.

    Parameters
    ----------
    task_id : int or None
        Refers to the unique identifier of OpenML task. ``None`` if the task
        has no ID; the ID is set from the server response after publishing.
    task_type_id : TaskType
        Refers to the type of OpenML task.
    task_type : str
        Name of the OpenML task type.
    data_set_id : int
        Refers to the data. Stored on the instance as ``dataset_id``.
    estimation_procedure_id : int, default=1
        Refers to the type of estimates used.
    estimation_procedure_type : str, default=None
        Refers to the type of estimation procedure used for the OpenML task.
    estimation_parameters : dict[str, str], default=None
        Estimation parameters used for the OpenML task.
    evaluation_measure : str, default=None
        Refers to the evaluation measure.
    data_splits_url : str, default=None
        Refers to the URL of the data splits used for the OpenML task.
    """

    def __init__(  # noqa: PLR0913
        self,
        task_id: int | None,
        task_type_id: TaskType,
        task_type: str,
        data_set_id: int,
        estimation_procedure_id: int = 1,
        estimation_procedure_type: str | None = None,
        estimation_parameters: dict[str, str] | None = None,
        evaluation_measure: str | None = None,
        data_splits_url: str | None = None,
    ):
        self.task_id = int(task_id) if task_id is not None else None
        self.task_type_id = task_type_id
        self.task_type = task_type
        self.dataset_id = int(data_set_id)
        self.evaluation_measure = evaluation_measure
        # Everything describing the estimation procedure is bundled in one mapping.
        self.estimation_procedure: _EstimationProcedure = {
            "type": estimation_procedure_type,
            "parameters": estimation_parameters,
            "data_splits_url": data_splits_url,
        }
        self.estimation_procedure_id = estimation_procedure_id
        # Filled on first use by the split accessors below.
        self.split: OpenMLSplit | None = None

    @classmethod
    def _entity_letter(cls) -> str:
        """Return the single-letter code used for tasks (``"t"``)."""
        return "t"

    @property
    def id(self) -> int | None:
        """Return the OpenML ID of this task."""
        return self.task_id

    def _get_repr_body_fields(self) -> Sequence[tuple[str, str | int | list[str]]]:
        """Collect all information to display in the __repr__ body.

        Returns
        -------
        list of (str, value) tuples
            The fields that are available for this task, in display order.
        """
        base_server_url = openml.config.get_server_base_url()
        fields: dict[str, Any] = {
            "Task Type Description": f"{base_server_url}/tt/{self.task_type_id}"
        }
        if self.task_id is not None:
            fields["Task ID"] = self.task_id
            fields["Task URL"] = self.openml_url
        if self.evaluation_measure is not None:
            fields["Evaluation Measure"] = self.evaluation_measure
        if self.estimation_procedure is not None:
            fields["Estimation Procedure"] = self.estimation_procedure["type"]

        # TODO(eddiebergman): Subclasses could advertise/provide this, instead of having to
        # have the base class know about it's subclasses.
        target_name = getattr(self, "target_name", None)
        if target_name is not None:
            fields["Target Feature"] = target_name

            class_labels = getattr(self, "class_labels", None)
            if class_labels is not None:
                fields["# of Classes"] = len(class_labels)

            # Only advertise a cost matrix that is actually set: an attribute that
            # exists but holds `None` must not be reported as "Available".
            if getattr(self, "cost_matrix", None) is not None:
                fields["Cost Matrix"] = "Available"

        # determines the order in which the information will be printed
        order = [
            "Task Type Description",
            "Task ID",
            "Task URL",
            "Estimation Procedure",
            "Evaluation Measure",
            "Target Feature",
            "# of Classes",
            "Cost Matrix",
        ]
        return [(key, fields[key]) for key in order if key in fields]

    def get_dataset(self) -> datasets.OpenMLDataset:
        """Download dataset associated with task."""
        return datasets.get_dataset(self.dataset_id)

    def get_train_test_split_indices(
        self,
        fold: int = 0,
        repeat: int = 0,
        sample: int = 0,
    ) -> tuple[np.ndarray, np.ndarray]:
        """Get the indices of the train and test splits for a given task.

        The split is downloaded on first use and kept on ``self.split``.

        Parameters
        ----------
        fold : int, default=0
            Fold to select.
        repeat : int, default=0
            Repeat to select.
        sample : int, default=0
            Sample to select.

        Returns
        -------
        tuple of np.ndarray
            Whatever ``self.split.get`` returns for the requested repeat, fold and sample.
        """
        # Replace with retrieve from cache
        if self.split is None:
            self.split = self.download_split()

        return self.split.get(repeat=repeat, fold=fold, sample=sample)

    def _download_split(self, cache_file: Path) -> None:
        """Download the split file to ``cache_file`` unless it can already be opened."""
        # TODO(eddiebergman): Not sure about this try to read and error approach
        try:
            # Opening the file is only a probe for a usable cached copy.
            with cache_file.open(encoding="utf8"):
                pass
        except OSError:
            split_url = self.estimation_procedure["data_splits_url"]
            openml._api_calls._download_text_file(
                source=str(split_url),
                output_path=str(cache_file),
            )

    def download_split(self) -> OpenMLSplit:
        """Download the OpenML split for a given task.

        The split is read from ``datasplits.arff`` in the task's cache directory;
        if reading raises ``OSError`` the file is downloaded first and read again.
        """
        # TODO(eddiebergman): Can this ever be `None`?
        assert self.task_id is not None
        cache_dir = _create_cache_directory_for_id("tasks", self.task_id)
        cached_split_file = cache_dir / "datasplits.arff"

        try:
            split = OpenMLSplit._from_arff_file(cached_split_file)
        except OSError:
            # Next, download and cache the associated split file
            self._download_split(cached_split_file)
            split = OpenMLSplit._from_arff_file(cached_split_file)

        return split

    def get_split_dimensions(self) -> tuple[int, int, int]:
        """Get the (repeats, folds, samples) of the split for a given task.

        The split is downloaded on first use and kept on ``self.split``.
        """
        if self.split is None:
            self.split = self.download_split()

        return self.split.repeats, self.split.folds, self.split.samples

    # TODO(eddiebergman): Really need some better typing on all this
    def _to_dict(self) -> dict[str, dict[str, int | str | list[dict[str, Any]]]]:
        """Creates a dictionary representation of self in a string format (for XML parsing).

        The evaluation measure is only included as an input when it is set.
        """
        oml_input = [
            {"@name": "source_data", "#text": str(self.dataset_id)},
            {"@name": "estimation_procedure", "#text": str(self.estimation_procedure_id)},
        ]
        if self.evaluation_measure is not None:
            oml_input.append({"@name": "evaluation_measures", "#text": self.evaluation_measure})

        return {
            "oml:task_inputs": {
                "@xmlns:oml": "http://openml.org/openml",
                "oml:task_type_id": self.task_type_id.value,  # This is an int from the enum?
                "oml:input": oml_input,
            }
        }

    def _parse_publish_response(self, xml_response: dict) -> None:
        """Parse the id from the xml_response and assign it to self."""
        self.task_id = int(xml_response["oml:upload_task"]["oml:id"])

id: int | None property

Return the OpenML ID of this task.

download_split()

Download the OpenML split for a given task.

Source code in openml/tasks/task.py
def download_split(self) -> OpenMLSplit:
    """Return the data split of this task, fetching it if it is not cached.

    The split is read from ``datasplits.arff`` inside the task's cache
    directory. If reading raises ``OSError``, the file is downloaded and
    then read again.
    """
    # TODO(eddiebergman): Can this ever be `None`?
    assert self.task_id is not None
    arff_path = _create_cache_directory_for_id("tasks", self.task_id) / "datasplits.arff"

    try:
        return OpenMLSplit._from_arff_file(arff_path)
    except OSError:
        # No readable cached copy: download it, then parse the fresh file.
        self._download_split(arff_path)
        return OpenMLSplit._from_arff_file(arff_path)

get_dataset()

Download dataset associated with task.

Source code in openml/tasks/task.py
def get_dataset(self) -> datasets.OpenMLDataset:
    """Fetch the dataset this task is defined on.

    Delegates to ``datasets.get_dataset`` using the task's ``dataset_id``.
    """
    dataset_id = self.dataset_id
    return datasets.get_dataset(dataset_id)

get_split_dimensions()

Get the (repeats, folds, samples) of the split for a given task.

Source code in openml/tasks/task.py
def get_split_dimensions(self) -> tuple[int, int, int]:
    """Return the ``(repeats, folds, samples)`` counts of this task's split.

    The split is obtained through ``download_split`` on first use and stored
    on ``self.split`` so later calls reuse it.
    """
    split = self.split
    if split is None:
        split = self.download_split()
        self.split = split

    return split.repeats, split.folds, split.samples

get_train_test_split_indices(fold=0, repeat=0, sample=0)

Get the indices of the train and test splits for a given task.

Source code in openml/tasks/task.py
def get_train_test_split_indices(
    self,
    fold: int = 0,
    repeat: int = 0,
    sample: int = 0,
) -> tuple[np.ndarray, np.ndarray]:
    """Return the train and test indices for one repeat/fold/sample of the task.

    The split is obtained through ``download_split`` on first use and stored
    on ``self.split``; the lookup itself is delegated to ``self.split.get``.
    """
    # Replace with retrieve from cache
    split = self.split
    if split is None:
        split = self.download_split()
        self.split = split

    return split.get(repeat=repeat, fold=fold, sample=sample)

TaskType

Bases: Enum

Possible task types as defined in OpenML.

Source code in openml/tasks/task.py
class TaskType(Enum):
    """Possible task types as defined in OpenML.

    Each member maps a symbolic name to the integer ID of that task type.
    """

    SUPERVISED_CLASSIFICATION = 1
    SUPERVISED_REGRESSION = 2
    LEARNING_CURVE = 3
    SUPERVISED_DATASTREAM_CLASSIFICATION = 4
    CLUSTERING = 5
    MACHINE_LEARNING_CHALLENGE = 6
    SURVIVAL_ANALYSIS = 7
    SUBGROUP_DISCOVERY = 8
    MULTITASK_REGRESSION = 9

create_task(task_type, dataset_id, estimation_procedure_id, target_name=None, evaluation_measure=None, **kwargs)

Create a task based on different given attributes.

Builds a task object with the function arguments as attributes. The type of the task object built is determined from the task type id. More information on how the arguments (task attributes) relate to the different possible tasks can be found in the individual task objects in the openml.tasks.task module.

Parameters:

Name Type Description Default
task_type TaskType

Id of the task type.

required
dataset_id int

The id of the dataset for the task.

required
target_name str

The name of the feature used as a target. At the moment, only optional for the clustering tasks.

None
estimation_procedure_id int

The id of the estimation procedure.

required
evaluation_measure str

The name of the evaluation measure.

None
kwargs dict

Other task attributes that are not mandatory for task upload.

{}

Returns:

Type Description
(OpenMLClassificationTask, OpenMLRegressionTask)
(OpenMLLearningCurveTask, OpenMLClusteringTask)
Source code in openml/tasks/functions.py
def create_task(
    task_type: TaskType,
    dataset_id: int,
    estimation_procedure_id: int,
    target_name: str | None = None,
    evaluation_measure: str | None = None,
    **kwargs: Any,
) -> (
    OpenMLClassificationTask | OpenMLRegressionTask | OpenMLLearningCurveTask | OpenMLClusteringTask
):
    """Create a task based on different given attributes.

    Builds a task object with the function arguments as
    attributes. The type of the task object built is
    determined from the task type id.
    More information on how the arguments (task attributes)
    relate to the different possible tasks can be found in
    the individual task objects in the openml.tasks.task
    module.

    Parameters
    ----------
    task_type : TaskType
        Id of the task type.
    dataset_id : int
        The id of the dataset for the task.
    estimation_procedure_id : int
        The id of the estimation procedure.
    target_name : str, optional
        The name of the feature used as a target.
        At the moment, only optional for the clustering tasks.
    evaluation_measure : str, optional
        The name of the evaluation measure.
    kwargs : dict, optional
        Other task attributes that are not mandatory
        for task upload.

    Returns
    -------
    OpenMLClassificationTask, OpenMLRegressionTask,
    OpenMLLearningCurveTask, OpenMLClusteringTask

    Raises
    ------
    NotImplementedError
        If ``task_type`` is not one of the clustering, learning curve,
        supervised classification or supervised regression task types.
    """
    if task_type == TaskType.CLUSTERING:
        task_cls = OpenMLClusteringTask
    elif task_type == TaskType.LEARNING_CURVE:
        task_cls = OpenMLLearningCurveTask  # type: ignore
    elif task_type == TaskType.SUPERVISED_CLASSIFICATION:
        task_cls = OpenMLClassificationTask  # type: ignore
    elif task_type == TaskType.SUPERVISED_REGRESSION:
        task_cls = OpenMLRegressionTask  # type: ignore
    else:
        # `TaskType` is a plain `Enum`: formatting a member with the `d` format code
        # raises `ValueError`, which would hide the intended `NotImplementedError`.
        # Report the numeric id for members and the raw value for anything else.
        type_id = task_type.value if isinstance(task_type, TaskType) else task_type
        raise NotImplementedError(f"Task type {type_id} not supported.")

    return task_cls(
        task_type_id=task_type,
        task_type="None",  # TODO: refactor to get task type string from ID.
        data_set_id=dataset_id,
        target_name=target_name,
        estimation_procedure_id=estimation_procedure_id,
        evaluation_measure=evaluation_measure,
        **kwargs,
    )

delete_task(task_id)

Delete task with id task_id from the OpenML server.

You can only delete tasks which you created and have no runs associated with them.

Parameters:

Name Type Description Default
task_id int

OpenML id of the task

required

Returns:

Type Description
bool

True if the deletion was successful. False otherwise.

Source code in openml/tasks/functions.py
def delete_task(task_id: int) -> bool:
    """Remove the task identified by `task_id` from the OpenML server.

    Only tasks that you created yourself and that have no runs
    associated with them can be deleted.

    Parameters
    ----------
    task_id : int
        OpenML id of the task

    Returns
    -------
    bool
        True if the deletion was successful. False otherwise.
    """
    was_deleted = openml.utils._delete_entity("task", task_id)
    return was_deleted

get_task(task_id, *dataset_args, download_splits=None, **get_dataset_kwargs)

Download OpenML task for a given task ID.

Downloads the task representation. By default, this will also download the data splits and the dataset. From version 0.15.0 onwards, neither the splits nor the dataset will be downloaded by default.

Use the download_splits parameter to control whether the splits are downloaded. Moreover, you may pass additional parameter (args or kwargs) that are passed to :meth:openml.datasets.get_dataset. For backwards compatibility, if download_data is passed as an additional parameter and download_splits is not explicitly set, download_data also overrules download_splits's value (deprecated from Version 0.15.0 onwards).

Parameters:

Name Type Description Default
task_id int

The OpenML task id of the task to download.

required
download_splits bool | None

Whether to download the splits as well. From version 0.15.0 onwards this is independent of download_data and will default to False.

None
dataset_args Any

Args and kwargs can be used to pass optional parameters to :meth:openml.datasets.get_dataset. This includes download_data. If set to True the splits are downloaded as well (deprecated from Version 0.15.0 onwards). The args are only present for backwards compatibility and will be removed from version 0.15.0 onwards.

()
get_dataset_kwargs Any

Args and kwargs can be used to pass optional parameters to :meth:openml.datasets.get_dataset. This includes download_data. If set to True the splits are downloaded as well (deprecated from Version 0.15.0 onwards). The args are only present for backwards compatibility and will be removed from version 0.15.0 onwards.

()

Returns:

Name Type Description
task OpenMLTask
Source code in openml/tasks/functions.py
@openml.utils.thread_safe_if_oslo_installed
def get_task(
    task_id: int,
    *dataset_args: Any,
    download_splits: bool | None = None,
    **get_dataset_kwargs: Any,
) -> OpenMLTask:
    """Download OpenML task for a given task ID.

    Downloads the task representation. By default, this will also download the data splits and
    the dataset. From version 0.15.0 onwards, neither the splits nor the dataset will be
    downloaded by default.

    Use the `download_splits` parameter to control whether the splits are downloaded.
    Moreover, you may pass additional parameters (args or kwargs) that are passed to
    :meth:`openml.datasets.get_dataset`.
    For backwards compatibility, if `download_data` is passed as an additional keyword argument
    and `download_splits` is not explicitly set, `download_data` also overrules
    `download_splits`'s value (deprecated from Version 0.15.0 onwards).

    Parameters
    ----------
    task_id : int
        The OpenML task id of the task to download.
    download_splits : bool, optional (default=None)
        Whether to download the splits as well. If ``None``, a ``FutureWarning`` is issued and
        the value of the `download_data` keyword argument is used, or ``True`` if that keyword
        is absent. From version 0.15.0 onwards this is independent of `download_data` and will
        default to ``False``.
    dataset_args, get_dataset_kwargs :
        Args and kwargs can be used to pass optional parameters to
        :meth:`openml.datasets.get_dataset`.
        This includes `download_data`. If set to True the splits are downloaded as well
        (deprecated from Version 0.15.0 onwards). The args are only present for backwards
        compatibility and will be removed from version 0.15.0 onwards.

    Returns
    -------
    task: OpenMLTask

    Raises
    ------
    ValueError
        If `task_id` is not an integer and cannot be cast to one.
    """
    if download_splits is None:
        # TODO(0.15): Switch download splits to False by default, adjust typing above, adjust
        #  documentation above, and remove warning.
        warnings.warn(
            "Starting from Version 0.15.0 `download_splits` will default to ``False`` instead "
            "of ``True`` and be independent from `download_data`. To disable this message until "
            "version 0.15 explicitly set `download_splits` to a bool.",
            FutureWarning,
            stacklevel=3,
        )
        download_splits = get_dataset_kwargs.get("download_data", True)

    if not isinstance(task_id, int):
        # TODO(0.15): Remove warning
        warnings.warn(
            "Task id must be specified as `int` from 0.14.0 onwards.",
            FutureWarning,
            stacklevel=3,
        )

    try:
        task_id = int(task_id)
    except (ValueError, TypeError) as e:
        # The value being validated here is the task id, not a dataset id.
        raise ValueError("Task ID is neither an Integer nor can be cast to an Integer.") from e

    tid_cache_dir = openml.utils._create_cache_directory_for_id(TASKS_CACHE_DIR_NAME, task_id)

    try:
        task = _get_task_description(task_id)
        dataset = get_dataset(task.dataset_id, *dataset_args, **get_dataset_kwargs)
        # List of class labels available in dataset description
        # Including class labels as part of task meta data handles
        #   the case where data download was initially disabled
        if isinstance(task, (OpenMLClassificationTask, OpenMLLearningCurveTask)):
            task.class_labels = dataset.retrieve_class_labels(task.target_name)
        # Clustering tasks do not have class labels
        # and do not offer download_split
        if download_splits and isinstance(task, OpenMLSupervisedTask):
            task.download_split()
    except Exception:
        # Clean up this task's cache directory on any failure, then propagate the
        # original exception unchanged.
        openml.utils._remove_cache_dir_for_id(TASKS_CACHE_DIR_NAME, tid_cache_dir)
        raise

    return task

get_tasks(task_ids, download_data=True, download_qualities=True)

Download tasks.

This function iterates :meth:openml.tasks.get_task.

Parameters:

Name Type Description Default
task_ids List[int]

A list of task ids to download.

required
download_data bool(default=True)

Option to trigger download of data along with the meta data.

True
download_qualities bool(default=True)

Option to download 'qualities' meta-data in addition to the minimal dataset description.

True

Returns:

Type Description
list
Source code in openml/tasks/functions.py
def get_tasks(
    task_ids: list[int],
    download_data: bool = True,  # noqa: FBT001, FBT002
    download_qualities: bool = True,  # noqa: FBT001, FBT002
) -> list[OpenMLTask]:
    """Download tasks.

    This function iterates :meth:`openml.tasks.get_task`.

    Parameters
    ----------
    task_ids : List[int]
        A list of task ids to download.
    download_data : bool (default = True)
        Option to trigger download of data along with the meta data.
    download_qualities : bool (default=True)
        Option to download 'qualities' meta-data in addition to the minimal dataset description.

    Returns
    -------
    list
        The downloaded tasks, in the order of `task_ids`.
    """
    # Forward both flags by keyword. Passed positionally they end up in `get_task`'s
    # `*dataset_args`, where `download_data` is invisible to its `download_splits`
    # fallback and `download_qualities` is bound to whatever the third positional
    # parameter of `get_dataset` happens to be.
    return [
        get_task(task_id, download_data=download_data, download_qualities=download_qualities)
        for task_id in task_ids
    ]

list_tasks(task_type=None, offset=None, size=None, tag=None, output_format='dict', **kwargs)

Return a number of tasks having the given tag and task_type

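The task_type filter is separated from the other filters because it is used as task_type in the task description, but it is named type when used as a filter in the list tasks call.

Parameters:

Name Type Description Default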
task_type TaskType

Refers to the type of task.

None
offset int

the number of tasks to skip, starting from the first

None
size int

the maximum number of tasks to show

None
tag str

the tag to include

None
output_format Literal['dict', 'dataframe']

The parameter decides the format of the output. - If 'dict' the output is a dict of dict - If 'dataframe' the output is a pandas DataFrame

'dict'
kwargs Any

Legal filter operators: data_tag, status, data_id, data_name, number_instances, number_features, number_classes, number_missing_values.

{}

Returns:

Type Description
dict

All tasks having the given task_type and the given tag. Every task is represented by a dictionary containing the following information: task id, dataset id, task_type and status. If qualities are calculated for the associated dataset, some of these are also returned.

dataframe

All tasks having the given task_type and the given tag. Every task is represented by a row in the data frame containing the following information as columns: task id, dataset id, task_type and status. If qualities are calculated for the associated dataset, some of these are also returned.

Source code in openml/tasks/functions.py
def list_tasks(
    task_type: TaskType | None = None,
    offset: int | None = None,
    size: int | None = None,
    tag: str | None = None,
    output_format: Literal["dict", "dataframe"] = "dict",
    **kwargs: Any,
) -> dict | pd.DataFrame:
    """Return a number of tasks having the given tag and task_type.

    The ``task_type`` filter is separated from the other filters because
    it is used as ``task_type`` in the task description, but it is named
    ``type`` when used as a filter in the list tasks call.

    Parameters
    ----------
    task_type : TaskType, optional
        Refers to the type of task.
    offset : int, optional
        the number of tasks to skip, starting from the first
    size : int, optional
        the maximum number of tasks to show
    tag : str, optional
        the tag to include
    output_format: str, optional (default='dict')
        The parameter decides the format of the output.
        - If 'dict' the output is a dict of dict
        - If 'dataframe' the output is a pandas DataFrame
    kwargs: dict, optional
        Legal filter operators: data_tag, status, data_id, data_name,
        number_instances, number_features,
        number_classes, number_missing_values.

    Returns
    -------
    dict
        All tasks having the given task_type and the given tag. Every task is
        represented by a dictionary containing the following information:
        task id, dataset id, task_type and status. If qualities are calculated
        for the associated dataset, some of these are also returned.
    dataframe
        All tasks having the given task_type and the given tag. Every task is
        represented by a row in the data frame containing the following information
        as columns: task id, dataset id, task_type and status. If qualities are
        calculated for the associated dataset, some of these are also returned.

    Raises
    ------
    ValueError
        If `output_format` is neither 'dict' nor 'dataframe'.
    """
    if output_format not in ("dataframe", "dict"):
        raise ValueError(
            "Invalid output format selected. Only 'dict' or 'dataframe' applicable.",
        )

    # TODO: [0.15]
    if output_format == "dict":
        warnings.warn(
            "Support for `output_format` of 'dict' will be removed in 0.15 "
            "and pandas dataframes will be returned instead. To ensure your code "
            "will continue to work, use `output_format`='dataframe'.",
            category=FutureWarning,
            stacklevel=2,
        )

    # `task_type`, `offset`, `size`, `tag` and the extra filters are handed to the
    # listing helper together with the `_list_tasks` callable.
    return openml.utils._list_all(  # type: ignore
        list_output_format=output_format,  # type: ignore
        listing_call=_list_tasks,
        task_type=task_type,
        offset=offset,
        size=size,
        tag=tag,
        **kwargs,
    )