Skip to content

functions

__list_runs(api_call, output_format='dict')

Helper function to parse API calls which are lists of runs

Source code in openml/runs/functions.py
def __list_runs(
    api_call: str, output_format: Literal["dict", "dataframe"] = "dict"
) -> dict | pd.DataFrame:
    """Helper function to parse API calls which are lists of runs"""
    xml_string = openml._api_calls._perform_api_call(api_call, "get")
    runs_dict = xmltodict.parse(xml_string, force_list=("oml:run",))
    # Minimalistic check if the XML is useful
    if "oml:runs" not in runs_dict:
        raise ValueError(f'Error in return XML, does not contain "oml:runs": {runs_dict}')

    if "@xmlns:oml" not in runs_dict["oml:runs"]:
        raise ValueError(
            f'Error in return XML, does not contain "oml:runs"/@xmlns:oml: {runs_dict}'
        )

    if runs_dict["oml:runs"]["@xmlns:oml"] != "http://openml.org/openml":
        raise ValueError(
            "Error in return XML, value of  "
            '"oml:runs"/@xmlns:oml is not '
            f'"http://openml.org/openml": {runs_dict}',
        )

    assert isinstance(runs_dict["oml:runs"]["oml:run"], list), type(runs_dict["oml:runs"])

    runs = {
        int(r["oml:run_id"]): {
            "run_id": int(r["oml:run_id"]),
            "task_id": int(r["oml:task_id"]),
            "setup_id": int(r["oml:setup_id"]),
            "flow_id": int(r["oml:flow_id"]),
            "uploader": int(r["oml:uploader"]),
            "task_type": TaskType(int(r["oml:task_type_id"])),
            "upload_time": str(r["oml:upload_time"]),
            "error_message": str((r["oml:error_message"]) or ""),
        }
        for r in runs_dict["oml:runs"]["oml:run"]
    }

    if output_format == "dataframe":
        runs = pd.DataFrame.from_dict(runs, orient="index")

    return runs

delete_run(run_id)

Delete run with id run_id from the OpenML server.

You can only delete runs which you uploaded.

Parameters:

Name Type Description Default
run_id int

OpenML id of the run

required

Returns:

Type Description
bool

True if the deletion was successful. False otherwise.

Source code in openml/runs/functions.py
def delete_run(run_id: int) -> bool:
    """Delete run with id `run_id` from the OpenML server.

    You can only delete runs which you uploaded.

    Parameters
    ----------
    run_id : int
        OpenML id of the run

    Returns
    -------
    bool
        True if the deletion was successful. False otherwise.
    """
    return openml.utils._delete_entity("run", run_id)

format_prediction(task, repeat, fold, index, prediction, truth, sample=None, proba=None)

Format the predictions in the specific order as required for the run results.

Parameters:

Name Type Description Default
task OpenMLSupervisedTask

Task for which to format the predictions.

required
repeat int

From which repeat this predictions is made.

required
fold int

From which fold this prediction is made.

required
index int

For which index this prediction is made.

required
prediction str | int | float

The predicted class label or value.

required
truth str | int | float

The true class label or value.

required
sample int | None

From which sample set this prediction is made. Required only for LearningCurve tasks.

None
proba dict[str, float] | None

For classification tasks only. A mapping from each class label to their predicted probability. The dictionary should contain an entry for each of the task.class_labels. E.g.: {"Iris-Setosa": 0.2, "Iris-Versicolor": 0.7, "Iris-Virginica": 0.1}

None

Returns:

Type Description
A list with elements for the prediction results of a run.
The returned order of the elements is (if available):

[repeat, fold, sample, index, prediction, truth, *probabilities]

This order follows the R Client API.
Source code in openml/runs/functions.py
def format_prediction(  # noqa: PLR0913
    task: OpenMLSupervisedTask,
    repeat: int,
    fold: int,
    index: int,
    prediction: str | int | float,
    truth: str | int | float,
    sample: int | None = None,
    proba: dict[str, float] | None = None,
) -> list[str | int | float]:
    """Format the predictions in the specific order as required for the run results.

    Parameters
    ----------
    task: OpenMLSupervisedTask
        Task for which to format the predictions.
    repeat: int
        From which repeat this predictions is made.
    fold: int
        From which fold this prediction is made.
    index: int
        For which index this prediction is made.
    prediction: str, int or float
        The predicted class label or value.
    truth: str, int or float
        The true class label or value.
    sample: int, optional (default=None)
        From which sample set this prediction is made.
        Required only for LearningCurve tasks.
    proba: Dict[str, float], optional (default=None)
        For classification tasks only.
        A mapping from each class label to their predicted probability.
        The dictionary should contain an entry for each of the `task.class_labels`.
        E.g.: {"Iris-Setosa": 0.2, "Iris-Versicolor": 0.7, "Iris-Virginica": 0.1}

    Returns
    -------
    A list with elements for the prediction results of a run.

    The returned order of the elements is (if available):
        [repeat, fold, sample, index, prediction, truth, *probabilities]

    This order follows the R Client API.
    """
    if isinstance(task, OpenMLClassificationTask):
        if proba is None:
            raise ValueError("`proba` is required for classification task")
        if task.class_labels is None:
            raise ValueError("The classification task must have class labels set")
        if not set(task.class_labels) == set(proba):
            raise ValueError("Each class should have a predicted probability")
        if sample is None:
            if isinstance(task, OpenMLLearningCurveTask):
                raise ValueError("`sample` can not be none for LearningCurveTask")

            sample = 0
        probabilities = [proba[c] for c in task.class_labels]
        return [repeat, fold, sample, index, prediction, truth, *probabilities]

    if isinstance(task, OpenMLRegressionTask):
        return [repeat, fold, index, prediction, truth]

    raise NotImplementedError(f"Formatting for {type(task)} is not supported.")

get_run(run_id, ignore_cache=False)

Gets run corresponding to run_id.

Parameters:

Name Type Description Default
run_id int
required
ignore_cache bool

Whether to ignore the cache. If true this will download and overwrite the run xml even if the requested run is already cached.

False
ignore_cache bool
False

Returns:

Name Type Description
run OpenMLRun

Run corresponding to ID, fetched from the server.

Source code in openml/runs/functions.py
@openml.utils.thread_safe_if_oslo_installed
def get_run(run_id: int, ignore_cache: bool = False) -> OpenMLRun:  # noqa: FBT002, FBT001
    """Gets run corresponding to run_id.

    Parameters
    ----------
    run_id : int

    ignore_cache : bool
        Whether to ignore the cache. If ``true`` this will download and overwrite the run xml
        even if the requested run is already cached.

    ignore_cache

    Returns
    -------
    run : OpenMLRun
        Run corresponding to ID, fetched from the server.
    """
    run_dir = Path(openml.utils._create_cache_directory_for_id(RUNS_CACHE_DIR_NAME, run_id))
    run_file = run_dir / "description.xml"

    run_dir.mkdir(parents=True, exist_ok=True)

    try:
        if not ignore_cache:
            return _get_cached_run(run_id)

        raise OpenMLCacheException(message="dummy")

    except OpenMLCacheException:
        run_xml = openml._api_calls._perform_api_call("run/%d" % run_id, "get")
        with run_file.open("w", encoding="utf8") as fh:
            fh.write(run_xml)

    return _create_run_from_xml(run_xml)

get_run_trace(run_id)

Get the optimization trace object for a given run id.

Parameters:

Name Type Description Default
run_id int
required

Returns:

Type Description
OpenMLTrace
Source code in openml/runs/functions.py
def get_run_trace(run_id: int) -> OpenMLRunTrace:
    """
    Get the optimization trace object for a given run id.

    Parameters
    ----------
    run_id : int

    Returns
    -------
    openml.runs.OpenMLTrace
    """
    trace_xml = openml._api_calls._perform_api_call("run/trace/%d" % run_id, "get")
    return OpenMLRunTrace.trace_from_xml(trace_xml)

get_runs(run_ids)

Gets all runs in run_ids list.

Parameters:

Name Type Description Default
run_ids list of ints
required

Returns:

Name Type Description
runs list of OpenMLRun

List of runs corresponding to IDs, fetched from the server.

Source code in openml/runs/functions.py
def get_runs(run_ids: list[int]) -> list[OpenMLRun]:
    """Gets all runs in run_ids list.

    Parameters
    ----------
    run_ids : list of ints

    Returns
    -------
    runs : list of OpenMLRun
        List of runs corresponding to IDs, fetched from the server.
    """
    runs = []
    for run_id in run_ids:
        runs.append(get_run(run_id))
    return runs

initialize_model_from_run(run_id)

Initialized a model based on a run_id (i.e., using the exact same parameter settings)

Parameters:

Name Type Description Default
run_id int

The Openml run_id

required

Returns:

Type Description
model
Source code in openml/runs/functions.py
def initialize_model_from_run(run_id: int) -> Any:
    """
    Initialized a model based on a run_id (i.e., using the exact
    same parameter settings)

    Parameters
    ----------
    run_id : int
        The Openml run_id

    Returns
    -------
    model
    """
    run = get_run(run_id)
    # TODO(eddiebergman): I imagine this is None if it's not published,
    # might need to raise an explicit error for that
    assert run.setup_id is not None
    return initialize_model(run.setup_id)

initialize_model_from_trace(run_id, repeat, fold, iteration=None)

Initialize a model based on the parameters that were set by an optimization procedure (i.e., using the exact same parameter settings)

Parameters:

Name Type Description Default
run_id int

The Openml run_id. Should contain a trace file, otherwise a OpenMLServerException is raised

required
repeat int

The repeat nr (column in trace file)

required
fold int

The fold nr (column in trace file)

required
iteration int

The iteration nr (column in trace file). If None, the best (selected) iteration will be searched (slow), according to the selection criteria implemented in OpenMLRunTrace.get_selected_iteration

None

Returns:

Type Description
model
Source code in openml/runs/functions.py
def initialize_model_from_trace(
    run_id: int,
    repeat: int,
    fold: int,
    iteration: int | None = None,
) -> Any:
    """
    Initialize a model based on the parameters that were set
    by an optimization procedure (i.e., using the exact same
    parameter settings)

    Parameters
    ----------
    run_id : int
        The Openml run_id. Should contain a trace file,
        otherwise a OpenMLServerException is raised

    repeat : int
        The repeat nr (column in trace file)

    fold : int
        The fold nr (column in trace file)

    iteration : int
        The iteration nr (column in trace file). If None, the
        best (selected) iteration will be searched (slow),
        according to the selection criteria implemented in
        OpenMLRunTrace.get_selected_iteration

    Returns
    -------
    model
    """
    run = get_run(run_id)
    # TODO(eddiebergman): I imagine this is None if it's not published,
    # might need to raise an explicit error for that
    assert run.flow_id is not None

    flow = get_flow(run.flow_id)
    run_trace = get_run_trace(run_id)

    if iteration is None:
        iteration = run_trace.get_selected_iteration(repeat, fold)

    request = (repeat, fold, iteration)
    if request not in run_trace.trace_iterations:
        raise ValueError("Combination repeat, fold, iteration not available")
    current = run_trace.trace_iterations[(repeat, fold, iteration)]

    search_model = initialize_model_from_run(run_id)
    return flow.extension.instantiate_model_from_hpo_class(search_model, current)

list_runs(offset=None, size=None, id=None, task=None, setup=None, flow=None, uploader=None, tag=None, study=None, display_errors=False, output_format='dict', **kwargs)

List all runs matching all of the given filters. (Supports large amount of results)

Parameters:

Name Type Description Default
offset int

the number of runs to skip, starting from the first

None
size int

the maximum number of runs to show

None
id list
None
task list
None
setup list | None
None
flow list
None
uploader list
None
tag str
None
study int
None
display_errors (bool, optional(default=None))

Whether to list runs which have an error (for example a missing prediction file).

False
output_format Literal['dict', 'dataframe']

The parameter decides the format of the output. - If 'dict' the output is a dict of dict - If 'dataframe' the output is a pandas DataFrame

'dict'
kwargs dict

Legal filter operators: task_type.

{}

Returns:

Type Description
dict of dicts, or dataframe
Source code in openml/runs/functions.py
def list_runs(  # noqa: PLR0913
    offset: int | None = None,
    size: int | None = None,
    id: list | None = None,  # noqa: A002
    task: list[int] | None = None,
    setup: list | None = None,
    flow: list | None = None,
    uploader: list | None = None,
    tag: str | None = None,
    study: int | None = None,
    display_errors: bool = False,  # noqa: FBT001, FBT002
    output_format: Literal["dict", "dataframe"] = "dict",
    **kwargs: Any,
) -> dict | pd.DataFrame:
    """
    List all runs matching all of the given filters.
    (Supports large amount of results)

    Parameters
    ----------
    offset : int, optional
        the number of runs to skip, starting from the first
    size : int, optional
        the maximum number of runs to show

    id : list, optional

    task : list, optional

    setup: list, optional

    flow : list, optional

    uploader : list, optional

    tag : str, optional

    study : int, optional

    display_errors : bool, optional (default=None)
        Whether to list runs which have an error (for example a missing
        prediction file).

    output_format: str, optional (default='dict')
        The parameter decides the format of the output.
        - If 'dict' the output is a dict of dict
        - If 'dataframe' the output is a pandas DataFrame

    kwargs : dict, optional
        Legal filter operators: task_type.

    Returns
    -------
    dict of dicts, or dataframe
    """
    if output_format not in ["dataframe", "dict"]:
        raise ValueError("Invalid output format selected. Only 'dict' or 'dataframe' applicable.")

    # TODO: [0.15]
    if output_format == "dict":
        msg = (
            "Support for `output_format` of 'dict' will be removed in 0.15 "
            "and pandas dataframes will be returned instead. To ensure your code "
            "will continue to work, use `output_format`='dataframe'."
        )
        warnings.warn(msg, category=FutureWarning, stacklevel=2)

    # TODO(eddiebergman): Do we really need this runtime type validation?
    if id is not None and (not isinstance(id, list)):
        raise TypeError("id must be of type list.")
    if task is not None and (not isinstance(task, list)):
        raise TypeError("task must be of type list.")
    if setup is not None and (not isinstance(setup, list)):
        raise TypeError("setup must be of type list.")
    if flow is not None and (not isinstance(flow, list)):
        raise TypeError("flow must be of type list.")
    if uploader is not None and (not isinstance(uploader, list)):
        raise TypeError("uploader must be of type list.")

    return openml.utils._list_all(  # type: ignore
        list_output_format=output_format,  # type: ignore
        listing_call=_list_runs,
        offset=offset,
        size=size,
        id=id,
        task=task,
        setup=setup,
        flow=flow,
        uploader=uploader,
        tag=tag,
        study=study,
        display_errors=display_errors,
        **kwargs,
    )

run_exists(task_id, setup_id)

Checks whether a task/setup combination is already present on the server.

Parameters:

Name Type Description Default
task_id int
required
setup_id int
required

Returns:

Type Description
Set run ids for runs where flow setup_id was run on task_id. Empty

set if it wasn't run yet.

Source code in openml/runs/functions.py
def run_exists(task_id: int, setup_id: int) -> set[int]:
    """Checks whether a task/setup combination is already present on the
    server.

    Parameters
    ----------
    task_id : int

    setup_id : int

    Returns
    -------
        Set run ids for runs where flow setup_id was run on task_id. Empty
        set if it wasn't run yet.
    """
    if setup_id <= 0:
        # openml setups are in range 1-inf
        return set()

    try:
        result = list_runs(task=[task_id], setup=[setup_id], output_format="dataframe")
        assert isinstance(result, pd.DataFrame)  # TODO(eddiebergman): Remove once #1299
        return set() if result.empty else set(result["run_id"])
    except OpenMLServerException as exception:
        # error code implies no results. The run does not exist yet
        if exception.code != ERROR_CODE:
            raise exception
        return set()

run_flow_on_task(flow, task, avoid_duplicate_runs=True, flow_tags=None, seed=None, add_local_measures=True, upload_flow=False, dataset_format='dataframe', n_jobs=None)

Run the model provided by the flow on the dataset defined by task.

Takes the flow and repeat information into account. The Flow may optionally be published.

Parameters:

Name Type Description Default
flow OpenMLFlow

A flow wraps a machine learning model together with relevant information. The model has a function fit(X,Y) and predict(X), all supervised estimators of scikit learn follow this definition of a model (https://scikit-learn.org/stable/tutorial/statistical_inference/supervised_learning.html)

required
task OpenMLTask

Task to perform. This may be an OpenMLFlow instead if the first argument is an OpenMLTask.

required
avoid_duplicate_runs (bool, optional(default=True))

If True, the run will throw an error if the setup/task combination is already present on the server. This feature requires an internet connection.

True
avoid_duplicate_runs (bool, optional(default=True))

If True, the run will throw an error if the setup/task combination is already present on the server. This feature requires an internet connection.

True
flow_tags (List[str], optional(default=None))

A list of tags that the flow should have at creation.

None
seed int | None

Models that are not seeded will get this seed.

None
add_local_measures (bool, optional(default=True))

Determines whether to calculate a set of evaluation measures locally, to later verify server behaviour.

True
upload_flow bool(default=False)

If True, upload the flow to OpenML if it does not exist yet. If False, do not upload the flow to OpenML.

False
dataset_format str(default='dataframe')

If 'array', the dataset is passed to the model as a numpy array. If 'dataframe', the dataset is passed to the model as a pandas dataframe.

'dataframe'
n_jobs int(default=None)

The number of processes/threads to distribute the evaluation asynchronously. If None or 1, then the evaluation is treated as synchronous and processed sequentially. If -1, then the job uses as many cores available.

None

Returns:

Name Type Description
run OpenMLRun

Result of the run.

Source code in openml/runs/functions.py
def run_flow_on_task(  # noqa: C901, PLR0912, PLR0915, PLR0913
    flow: OpenMLFlow,
    task: OpenMLTask,
    avoid_duplicate_runs: bool = True,  # noqa: FBT002, FBT001
    flow_tags: list[str] | None = None,
    seed: int | None = None,
    add_local_measures: bool = True,  # noqa: FBT001, FBT002
    upload_flow: bool = False,  # noqa: FBT001, FBT002
    dataset_format: Literal["array", "dataframe"] = "dataframe",
    n_jobs: int | None = None,
) -> OpenMLRun:
    """Run the model provided by the flow on the dataset defined by task.

    Takes the flow and repeat information into account.
    The Flow may optionally be published.

    Parameters
    ----------
    flow : OpenMLFlow
        A flow wraps a machine learning model together with relevant information.
        The model has a function fit(X,Y) and predict(X),
        all supervised estimators of scikit learn follow this definition of a model
        (https://scikit-learn.org/stable/tutorial/statistical_inference/supervised_learning.html)
    task : OpenMLTask
        Task to perform. This may be an OpenMLFlow instead if the first argument is an OpenMLTask.
    avoid_duplicate_runs : bool, optional (default=True)
        If True, the run will throw an error if the setup/task combination is already present on
        the server. This feature requires an internet connection.
    avoid_duplicate_runs : bool, optional (default=True)
        If True, the run will throw an error if the setup/task combination is already present on
        the server. This feature requires an internet connection.
    flow_tags : List[str], optional (default=None)
        A list of tags that the flow should have at creation.
    seed: int, optional (default=None)
        Models that are not seeded will get this seed.
    add_local_measures : bool, optional (default=True)
        Determines whether to calculate a set of evaluation measures locally,
        to later verify server behaviour.
    upload_flow : bool (default=False)
        If True, upload the flow to OpenML if it does not exist yet.
        If False, do not upload the flow to OpenML.
    dataset_format : str (default='dataframe')
        If 'array', the dataset is passed to the model as a numpy array.
        If 'dataframe', the dataset is passed to the model as a pandas dataframe.
    n_jobs : int (default=None)
        The number of processes/threads to distribute the evaluation asynchronously.
        If `None` or `1`, then the evaluation is treated as synchronous and processed sequentially.
        If `-1`, then the job uses as many cores available.

    Returns
    -------
    run : OpenMLRun
        Result of the run.
    """
    if flow_tags is not None and not isinstance(flow_tags, list):
        raise ValueError("flow_tags should be a list")

    # TODO: At some point in the future do not allow for arguments in old order (changed 6-2018).
    # Flexibility currently still allowed due to code-snippet in OpenML100 paper (3-2019).
    if isinstance(flow, OpenMLTask) and isinstance(task, OpenMLFlow):
        # We want to allow either order of argument (to avoid confusion).
        warnings.warn(
            "The old argument order (Flow, model) is deprecated and "
            "will not be supported in the future. Please use the "
            "order (model, Flow).",
            DeprecationWarning,
            stacklevel=2,
        )
        task, flow = flow, task

    if task.task_id is None:
        raise ValueError("The task should be published at OpenML")

    if flow.model is None:
        flow.model = flow.extension.flow_to_model(flow)

    flow.model = flow.extension.seed_model(flow.model, seed=seed)

    # We only need to sync with the server right now if we want to upload the flow,
    # or ensure no duplicate runs exist. Otherwise it can be synced at upload time.
    flow_id = None
    if upload_flow or avoid_duplicate_runs:
        flow_id = flow_exists(flow.name, flow.external_version)
        if isinstance(flow.flow_id, int) and flow_id != flow.flow_id:
            if flow_id is not False:
                raise PyOpenMLError(
                    "Local flow_id does not match server flow_id: "
                    f"'{flow.flow_id}' vs '{flow_id}'",
                )
            raise PyOpenMLError(
                "Flow does not exist on the server, but 'flow.flow_id' is not None."
            )
        if upload_flow and flow_id is False:
            flow.publish()
            flow_id = flow.flow_id
        elif flow_id:
            flow_from_server = get_flow(flow_id)
            _copy_server_fields(flow_from_server, flow)
            if avoid_duplicate_runs:
                flow_from_server.model = flow.model
                setup_id = setup_exists(flow_from_server)
                ids = run_exists(task.task_id, setup_id)
                if ids:
                    error_message = (
                        "One or more runs of this setup were already performed on the task."
                    )
                    raise OpenMLRunsExistError(ids, error_message)
        else:
            # Flow does not exist on server and we do not want to upload it.
            # No sync with the server happens.
            flow_id = None

    dataset = task.get_dataset()

    run_environment = flow.extension.get_version_information()
    tags = ["openml-python", run_environment[1]]

    if flow.extension.check_if_model_fitted(flow.model):
        warnings.warn(
            "The model is already fitted!"
            " This might cause inconsistency in comparison of results.",
            RuntimeWarning,
            stacklevel=2,
        )

    # execute the run
    res = _run_task_get_arffcontent(
        model=flow.model,
        task=task,
        extension=flow.extension,
        add_local_measures=add_local_measures,
        dataset_format=dataset_format,
        n_jobs=n_jobs,
    )

    data_content, trace, fold_evaluations, sample_evaluations = res
    fields = [*run_environment, time.strftime("%c"), "Created by run_flow_on_task"]
    generated_description = "\n".join(fields)
    run = OpenMLRun(
        task_id=task.task_id,
        flow_id=flow_id,
        dataset_id=dataset.dataset_id,
        model=flow.model,
        flow_name=flow.name,
        tags=tags,
        trace=trace,
        data_content=data_content,
        flow=flow,
        setup_string=flow.extension.create_setup_string(flow.model),
        description_text=generated_description,
    )

    if (upload_flow or avoid_duplicate_runs) and flow.flow_id is not None:
        # We only extract the parameter settings if a sync happened with the server.
        # I.e. when the flow was uploaded or we found it in the avoid_duplicate check.
        # Otherwise, we will do this at upload time.
        run.parameter_settings = flow.extension.obtain_parameter_values(flow)

    # now we need to attach the detailed evaluations
    if task.task_type_id == TaskType.LEARNING_CURVE:
        run.sample_evaluations = sample_evaluations
    else:
        run.fold_evaluations = fold_evaluations

    if flow_id:
        message = f"Executed Task {task.task_id} with Flow id:{run.flow_id}"
    else:
        message = f"Executed Task {task.task_id} on local Flow with name {flow.name}."
    config.logger.info(message)

    return run

run_model_on_task(model, task, avoid_duplicate_runs=True, flow_tags=None, seed=None, add_local_measures=True, upload_flow=False, return_flow=False, dataset_format='dataframe', n_jobs=None)

Run the model on the dataset defined by the task.

Parameters:

Name Type Description Default
model sklearn model

A model which has a function fit(X,Y) and predict(X), all supervised estimators of scikit learn follow this definition of a model (https://scikit-learn.org/stable/tutorial/statistical_inference/supervised_learning.html)

required
task OpenMLTask or int or str

Task to perform or Task id. This may be a model instead if the first argument is an OpenMLTask.

required
avoid_duplicate_runs (bool, optional(default=True))

If True, the run will throw an error if the setup/task combination is already present on the server. This feature requires an internet connection.

True
flow_tags (List[str], optional(default=None))

A list of tags that the flow should have at creation.

None
seed int | None

Models that are not seeded will get this seed.

None
add_local_measures (bool, optional(default=True))

Determines whether to calculate a set of evaluation measures locally, to later verify server behaviour.

True
upload_flow bool(default=False)

If True, upload the flow to OpenML if it does not exist yet. If False, do not upload the flow to OpenML.

False
return_flow bool(default=False)

If True, returns the OpenMLFlow generated from the model in addition to the OpenMLRun.

False
dataset_format str(default='dataframe')

If 'array', the dataset is passed to the model as a numpy array. If 'dataframe', the dataset is passed to the model as a pandas dataframe.

'dataframe'
n_jobs int(default=None)

The number of processes/threads to distribute the evaluation asynchronously. If None or 1, then the evaluation is treated as synchronous and processed sequentially. If -1, then the job uses as many cores available.

None

Returns:

Name Type Description
run OpenMLRun

Result of the run.

flow OpenMLFlow (optional, only if `return_flow` is True).

Flow generated from the model.

Source code in openml/runs/functions.py
def run_model_on_task(  # noqa: PLR0913
    model: Any,
    task: int | str | OpenMLTask,
    avoid_duplicate_runs: bool = True,  # noqa: FBT001, FBT002
    flow_tags: list[str] | None = None,
    seed: int | None = None,
    add_local_measures: bool = True,  # noqa: FBT001, FBT002
    upload_flow: bool = False,  # noqa: FBT001, FBT002
    return_flow: bool = False,  # noqa: FBT001, FBT002
    dataset_format: Literal["array", "dataframe"] = "dataframe",
    n_jobs: int | None = None,
) -> OpenMLRun | tuple[OpenMLRun, OpenMLFlow]:
    """Run the model on the dataset defined by the task.

    Parameters
    ----------
    model : sklearn model
        A model which has a function fit(X,Y) and predict(X),
        all supervised estimators of scikit learn follow this definition of a model
        (https://scikit-learn.org/stable/tutorial/statistical_inference/supervised_learning.html)
    task : OpenMLTask or int or str
        Task to perform or Task id.
        This may be a model instead if the first argument is an OpenMLTask.
    avoid_duplicate_runs : bool, optional (default=True)
        If True, the run will throw an error if the setup/task combination is already present on
        the server. This feature requires an internet connection.
    flow_tags : List[str], optional (default=None)
        A list of tags that the flow should have at creation.
    seed: int, optional (default=None)
        Models that are not seeded will get this seed.
    add_local_measures : bool, optional (default=True)
        Determines whether to calculate a set of evaluation measures locally,
        to later verify server behaviour.
    upload_flow : bool (default=False)
        If True, upload the flow to OpenML if it does not exist yet.
        If False, do not upload the flow to OpenML.
    return_flow : bool (default=False)
        If True, returns the OpenMLFlow generated from the model in addition to the OpenMLRun.
    dataset_format : str (default='dataframe')
        If 'array', the dataset is passed to the model as a numpy array.
        If 'dataframe', the dataset is passed to the model as a pandas dataframe.
    n_jobs : int (default=None)
        The number of processes/threads to distribute the evaluation asynchronously.
        If `None` or `1`, then the evaluation is treated as synchronous and processed sequentially.
        If `-1`, then the job uses as many cores available.

    Returns
    -------
    run : OpenMLRun
        Result of the run.
    flow : OpenMLFlow (optional, only if `return_flow` is True).
        Flow generated from the model.
    """
    if avoid_duplicate_runs and not config.apikey:
        warnings.warn(
            "avoid_duplicate_runs is set to True, but no API key is set. "
            "Please set your API key in the OpenML configuration file, see"
            "https://openml.github.io/openml-python/main/examples/20_basic/introduction_tutorial"
            ".html#authentication for more information on authentication.",
            RuntimeWarning,
            stacklevel=2,
        )

    # TODO: At some point in the future do not allow for arguments in old order (6-2018).
    # Flexibility currently still allowed due to code-snippet in OpenML100 paper (3-2019).
    # When removing this please also remove the method `is_estimator` from the extension
    # interface as it is only used here (MF, 3-2019)
    if isinstance(model, (int, str, OpenMLTask)):
        warnings.warn(
            "The old argument order (task, model) is deprecated and "
            "will not be supported in the future. Please use the "
            "order (model, task).",
            DeprecationWarning,
            stacklevel=2,
        )
        task, model = model, task

    extension = get_extension_by_model(model, raise_if_no_extension=True)
    if extension is None:
        # This should never happen and is only here to please mypy will be gone soon once the
        # whole function is removed
        raise TypeError(extension)

    flow = extension.model_to_flow(model)

    def get_task_and_type_conversion(_task: int | str | OpenMLTask) -> OpenMLTask:
        """Retrieve an OpenMLTask object from either an integer or string ID,
        or directly from an OpenMLTask object.

        Parameters
        ----------
        _task : Union[int, str, OpenMLTask]
            The task ID or the OpenMLTask object.

        Returns
        -------
        OpenMLTask
            The OpenMLTask object.
        """
        if isinstance(_task, (int, str)):
            return get_task(int(_task))  # type: ignore

        return _task

    task = get_task_and_type_conversion(task)

    run = run_flow_on_task(
        task=task,
        flow=flow,
        avoid_duplicate_runs=avoid_duplicate_runs,
        flow_tags=flow_tags,
        seed=seed,
        add_local_measures=add_local_measures,
        upload_flow=upload_flow,
        dataset_format=dataset_format,
        n_jobs=n_jobs,
    )
    if return_flow:
        return run, flow
    return run