Multiple-kernel ridge refinement

This example demonstrates how to solve multiple-kernel ridge regression with hyperparameter random search, then refine the results with hyperparameter gradient descent.

import numpy as np

from himalaya.backend import set_backend
from himalaya.kernel_ridge import MultipleKernelRidgeCV
from himalaya.kernel_ridge import Kernelizer
from himalaya.kernel_ridge import ColumnKernelizer
from himalaya.utils import generate_multikernel_dataset

from sklearn.pipeline import make_pipeline
from sklearn import set_config
set_config(display='diagram')

In this example, we use the cupy backend (GPU). With ``on_error="warn"``, himalaya only raises a warning and keeps the previous backend if cupy fails to import.

backend = set_backend("cupy", on_error="warn")
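We can check which backend is actually in use (a minimal sketch, assuming the backend module exposes a ``name`` attribute):

# Print the name of the backend actually selected ("cupy", or a fallback).
print(backend.name)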

Generate a random dataset

  • X_train : array of shape (n_samples_train, n_features)

  • X_test : array of shape (n_samples_test, n_features)

  • Y_train : array of shape (n_samples_train, n_targets)

  • Y_test : array of shape (n_samples_test, n_targets)

  • kernel_weights : array of shape (n_targets, n_kernels)

  • n_features_list : list of length n_kernels, with the number of features in each feature space

(X_train, X_test, Y_train, Y_test, kernel_weights,
 n_features_list) = generate_multikernel_dataset(n_kernels=4, n_targets=50,
                                                 n_samples_train=600,
                                                 n_samples_test=300,
                                                 random_state=42)

feature_names = [f"Feature space {ii}" for ii in range(len(n_features_list))]
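As a quick sanity check, the shapes match the description above: 600 training samples, 300 test samples, 50 targets, and four feature spaces of 1000 features each.

# The four feature spaces are concatenated along the columns of X.
print(X_train.shape)  # (600, 4000)
print(Y_train.shape)  # (600, 50)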

Prepare the pipeline

# Find the start and end column of each feature space in X
start_and_end = np.concatenate([[0], np.cumsum(n_features_list)])
slices = [
    slice(start, end)
    for start, end in zip(start_and_end[:-1], start_and_end[1:])
]

# Create a different ``Kernelizer`` for each feature space.
kernelizers = [(f"space {ii}", Kernelizer(), slice_)
               for ii, slice_ in enumerate(slices)]
column_kernelizer = ColumnKernelizer(kernelizers)
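Here each feature space has 1000 features, so ``slices`` partitions the 4000 columns into four contiguous blocks, matching the ``ColumnKernelizer`` repr in the fit output below. Each ``Kernelizer`` computes one linear kernel (its default) on its block of columns.

# Four contiguous blocks of 1000 columns each.
print(slices)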

Define the random-search model

We use very few iterations on purpose, to make the random search suboptimal, and then refine the results with hyperparameter gradient descent.

# Grid of 41 log-spaced ridge penalties, and only 5 random candidates.
solver_params = dict(n_iter=5, alphas=np.logspace(-10, 10, 41))

model_1 = MultipleKernelRidgeCV(kernels="precomputed", solver="random_search",
                                solver_params=solver_params, random_state=42)
pipe_1 = make_pipeline(column_kernelizer, model_1)

# Fit the model on all targets
pipe_1.fit(X_train, Y_train)

Out:

[                                        ] 0% | 0.00 sec | 5 random sampling with cv |
[........                                ] 20% | 0.17 sec | 5 random sampling with cv |
[................                        ] 40% | 0.35 sec | 5 random sampling with cv |
[........................                ] 60% | 0.52 sec | 5 random sampling with cv |
[................................        ] 80% | 0.69 sec | 5 random sampling with cv |
[........................................] 100% | 0.87 sec | 5 random sampling with cv |
Pipeline(steps=[('columnkernelizer',
                 ColumnKernelizer(transformers=[('space 0', Kernelizer(),
                                                 slice(0, 1000, None)),
                                                ('space 1', Kernelizer(),
                                                 slice(1000, 2000, None)),
                                                ('space 2', Kernelizer(),
                                                 slice(2000, 3000, None)),
                                                ('space 3', Kernelizer(),
                                                 slice(3000, 4000, None))])),
                ('multiplekernelridgecv',
                 MultipleKernelRidgeCV(kernels='precomputed', random_state=42,
                                       solver_params=...
       1.00000000e-02, 3.16227766e-02, 1.00000000e-01, 3.16227766e-01,
       1.00000000e+00, 3.16227766e+00, 1.00000000e+01, 3.16227766e+01,
       1.00000000e+02, 3.16227766e+02, 1.00000000e+03, 3.16227766e+03,
       1.00000000e+04, 3.16227766e+04, 1.00000000e+05, 3.16227766e+05,
       1.00000000e+06, 3.16227766e+06, 1.00000000e+07, 3.16227766e+07,
       1.00000000e+08, 3.16227766e+08, 1.00000000e+09, 3.16227766e+09,
       1.00000000e+10]),
                                                      'n_iter': 5}))])
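The fitted model exposes the selected hyperparameters and the cross-validation scores of each sampled candidate. A quick look at their shapes (a sketch; ``deltas_`` stores the log kernel weights, with one column per target):

# deltas_: log kernel weights selected for each target.
print(pipe_1[-1].deltas_.shape)     # (n_kernels, n_targets) = (4, 50)
# cv_scores_: cross-validation score of each candidate, for each target.
print(pipe_1[-1].cv_scores_.shape)  # (n_iter, n_targets) = (5, 50)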


Define the gradient-descent model

# The ``initial_deltas`` value below is only a placeholder string; it is
# replaced with the deltas found by the random search before fitting.
solver_params = dict(max_iter=10, hyper_gradient_method="direct",
                     max_iter_inner_hyper=10,
                     initial_deltas="here_will_go_the_previous_deltas")

model_2 = MultipleKernelRidgeCV(kernels="precomputed", solver="hyper_gradient",
                                solver_params=solver_params)
pipe_2 = make_pipeline(column_kernelizer, model_2)

Use the random search to initialize the gradient descent

# We refine only the best-predicting targets, since the hyperparameter
# gradient descent is less efficient over many targets.
top = 60  # top 60%
best_cv_scores = backend.to_numpy(pipe_1[-1].cv_scores_.max(0))
mask = best_cv_scores > np.percentile(best_cv_scores, 100 - top)
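# Sanity check: with 50 targets, keeping the top 60% selects 30 targets,
# which matches the 30 columns of deltas refined in the fit output below.
print(mask.sum())  # 30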

pipe_2[-1].solver_params['initial_deltas'] = pipe_1[-1].deltas_[:, mask]
pipe_2.fit(X_train, Y_train[:, mask])

Out:

[                                        ] 0% | 0.00 sec | hypergradient_direct |
[                                        ] 0% | 0.00 sec | hypergradient_direct |
[....                                    ] 10% | 0.35 sec | hypergradient_direct |
[........                                ] 20% | 0.45 sec | hypergradient_direct |
[............                            ] 30% | 0.54 sec | hypergradient_direct |
[................                        ] 40% | 0.64 sec | hypergradient_direct |
[....................                    ] 50% | 0.74 sec | hypergradient_direct |
[........................                ] 60% | 0.83 sec | hypergradient_direct |
[............................            ] 70% | 0.93 sec | hypergradient_direct |
[................................        ] 80% | 1.03 sec | hypergradient_direct |
[....................................    ] 90% | 1.13 sec | hypergradient_direct |
[........................................] 100% | 1.34 sec | hypergradient_direct |
Pipeline(steps=[('columnkernelizer',
                 ColumnKernelizer(transformers=[('space 0', Kernelizer(),
                                                 slice(0, 1000, None)),
                                                ('space 1', Kernelizer(),
                                                 slice(1000, 2000, None)),
                                                ('space 2', Kernelizer(),
                                                 slice(2000, 3000, None)),
                                                ('space 3', Kernelizer(),
                                                 slice(3000, 4000, None))])),
                ('multiplekernelridgecv',
                 MultipleKernelRidgeCV(kernels='precomputed',
                                       solver='hyper_gradient',
                                       solver...
       [ 21.57146   , -21.007488  ,  -4.9082675 ,  -3.738099  ,
          9.1097975 ,  -7.008298  ,  20.622723  ,  -7.008298  ,
         20.622723  ,  -5.857005  ,  21.639557  , -19.856195  ,
         21.57146   ,  21.57146   , -21.007488  ,  21.639557  ,
         -7.008298  ,  21.57146   ,  21.57146   , -13.180669  ,
         -1.45439   ,   6.807213  ,  10.058536  ,  10.058536  ,
         15.601645  ,  -3.7569752 , -22.15878   ,  21.57146   ,
         21.57146   ,   6.6046576 ]], dtype=float32),
                                                      'max_iter': 10,
                                                      'max_iter_inner_hyper': 10}))])
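After refitting, the refined log kernel weights are available through the same ``deltas_`` attribute, with one column per selected target (a minimal sketch):

print(pipe_2[-1].deltas_.shape)  # (n_kernels, n_selected_targets) = (4, 30)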


Compute predictions on a test set

import matplotlib.pyplot as plt

# use the first model for all targets
test_scores_1 = pipe_1.score(X_test, Y_test)

# use the second model for the refined targets
test_scores_2 = backend.copy(test_scores_1)
test_scores_2[mask] = pipe_2.score(X_test, Y_test[:, mask])

test_scores_1 = backend.to_numpy(test_scores_1)
test_scores_2 = backend.to_numpy(test_scores_2)
plt.figure(figsize=(4, 4))
plt.scatter(test_scores_1, test_scores_2, alpha=0.3)
plt.xlim(0, 1)
plt.plot(plt.xlim(), plt.xlim(), color='k', lw=1)
plt.xlabel(r"Base model")
plt.ylabel(r"Refined model")
plt.title("$R^2$ generalization score")
plt.grid()
plt.tight_layout()
plt.show()
[Figure: scatter plot of $R^2$ generalization scores, refined model versus base model, with the identity line for reference]
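To quantify the refinement beyond the scatter plot, we can compare the test scores of the refined targets directly (a short sketch using the arrays computed above):

# Average R^2 change on the refined targets; positive values mean the
# gradient-descent refinement improved over the random search alone.
improvement = test_scores_2[mask] - test_scores_1[mask]
print(improvement.mean())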

Total running time of the script: (0 minutes 2.369 seconds)
