Imports

import os
import tempfile

import lightgbm as lgb
import optuna
import pandas as pd
from datasetsforecast.m4 import M4, M4Evaluation, M4Info
from sklearn.linear_model import Ridge
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from utilsforecast.plotting import plot_series

from mlforecast import MLForecast
from mlforecast.auto import (
    AutoLightGBM,
    AutoMLForecast,
    AutoModel,
    AutoRidge,
    ridge_space,
)
from mlforecast.lag_transforms import ExponentiallyWeightedMean, RollingMean

Data setup

def get_data(group, horizon):
    df, *_ = M4.load(directory='data', group=group)
    df['ds'] = df['ds'].astype('int')
    df['unique_id'] = df['unique_id'].astype('category')
    return df.groupby('unique_id').head(-horizon).copy()

group = 'Hourly'
horizon = M4Info[group].horizon
train = get_data(group, horizon)

Optimization

Default optimization

We provide default search spaces for some models, and the default features to try are defined based on the length of your data's seasonal period. In this example we use hourly data, so we set the season length to 24 (one day).

optuna.logging.set_verbosity(optuna.logging.ERROR)
auto_mlf = AutoMLForecast(
    models={'lgb': AutoLightGBM(), 'ridge': AutoRidge()},
    freq=1,
    season_length=24,
)
auto_mlf.fit(
    train,
    n_windows=2,
    h=horizon,
    num_samples=2,  # number of trials to run
)
AutoMLForecast(models={'lgb': AutoModel(model=LGBMRegressor), 'ridge': AutoModel(model=Ridge)})

We can now use these models to make predictions.

preds = auto_mlf.predict(horizon)
preds.head()
  unique_id   ds         lgb       ridge
0        H1  701  680.534943  604.140123
1        H1  702  599.038307  523.364874
2        H1  703  572.808421  479.174481
3        H1  704  564.573783  444.540062
4        H1  705  543.046026  419.987657
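
Since plot_series is already imported, we can do a quick visual sanity check of the forecasts. This is a minimal sketch; the max_ids and max_insample_length arguments are assumed to be available in the installed utilsforecast version and simply restrict the plot to two series and the last week of history.

# plot two series along with their forecasts, showing only the last week of history
fig = plot_series(train, preds, max_ids=2, max_insample_length=24 * 7)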

We can then evaluate them.

def evaluate(df, group):
    results = []
    for model in df.columns.drop(['unique_id', 'ds']):
        model_res = M4Evaluation.evaluate(
            'data', group, df[model].to_numpy().reshape(-1, horizon)
        )
        model_res.index = [model]
        results.append(model_res)
    return pd.concat(results).T.round(2)

evaluate(preds, group)
         lgb  ridge
SMAPE  18.78  20.00
MASE    5.07   1.29
OWA     1.57   0.81

Tuning model parameters

You can provide your own models along with their search spaces to perform the optimization. The search space should be a function that takes an optuna trial and returns the model parameters.

def my_lgb_config(trial: optuna.Trial):
    return {
        'learning_rate': 0.05,
        'verbosity': -1,
        'num_leaves': trial.suggest_int('num_leaves', 2, 128, log=True),
        'objective': trial.suggest_categorical('objective', ['l1', 'l2', 'mape']),
    }

my_lgb = AutoModel(
    model=lgb.LGBMRegressor(),
    config=my_lgb_config,
)
auto_mlf = AutoMLForecast(
    models={'my_lgb': my_lgb},
    freq=1,
    season_length=24,
).fit(
    train,
    n_windows=2,
    h=horizon,
    num_samples=2,
)
preds = auto_mlf.predict(horizon)
evaluate(preds, group)
       my_lgb
SMAPE   18.67
MASE     4.79
OWA      1.51

Tuning scikit-learn pipelines

We use BaseEstimator.set_params internally for each configuration, so if you're using a scikit-learn pipeline you can tune its parameters as you would with scikit-learn's search methods.

ridge_pipeline = make_pipeline(
    ColumnTransformer(
        [('encoder', OneHotEncoder(), ['unique_id'])],
        remainder='passthrough',
    ),
    Ridge()
)
my_auto_ridge = AutoModel(
    ridge_pipeline,
    # the space must have the name of the estimator followed by the parameter
    # you could also tune the encoder here
    lambda trial: {f'ridge__{k}': v for k, v in ridge_space(trial).items()},
)
auto_mlf = AutoMLForecast(
    models={'ridge': my_auto_ridge},
    freq=1,
    season_length=24,
    fit_config=lambda trial: {'static_features': ['unique_id']}
).fit(
    train,
    n_windows=2,
    h=horizon,
    num_samples=2,
)
preds = auto_mlf.predict(horizon)
evaluate(preds, group)
       ridge
SMAPE  18.50
MASE    1.24
OWA     0.76

Tuning features

The MLForecast class defines the features to build in its constructor. You can tune the features by providing a function through the init_config argument; it takes an optuna trial and produces a configuration that is passed to the MLForecast constructor.

def my_init_config(trial: optuna.Trial):
    lag_transforms = [
        ExponentiallyWeightedMean(alpha=0.3),
        RollingMean(window_size=24 * 7, min_samples=1),
    ]
    lag_to_transform = trial.suggest_categorical('lag_to_transform', [24, 48])
    return {
        'lags': [24 * i for i in range(1, 7)],  # this won't be tuned
        'lag_transforms': {lag_to_transform: lag_transforms},
    }

auto_mlf = AutoMLForecast(
    models=[AutoRidge()],
    freq=1,
    season_length=24,
    init_config=my_init_config,
).fit(
    train,
    n_windows=2,
    h=horizon,
    num_samples=2,
)
preds = auto_mlf.predict(horizon)
evaluate(preds, group)
       AutoRidge
SMAPE      13.31
MASE        1.67
OWA         0.71

Tuning fit parameters

The MLForecast.fit method takes some arguments that can improve the forecasting performance of your models, such as dropna and static_features. If you want to tune those, you can provide a function to the fit_config argument.

def my_fit_config(trial: optuna.Trial):
    if trial.suggest_int('use_id', 0, 1):
        static_features = ['unique_id']
    else:
        static_features = None
    return {
        'static_features': static_features
    }

auto_mlf = AutoMLForecast(
    models=[AutoLightGBM()],
    freq=1,
    season_length=24,
    fit_config=my_fit_config,
).fit(
    train,
    n_windows=2,
    h=horizon,
    num_samples=2,
)
preds = auto_mlf.predict(horizon)
evaluate(preds, group)
       AutoLightGBM
SMAPE         18.78
MASE           5.07
OWA            1.57

Accessing the optimization results

Once the process has finished, the results are available under the results_ attribute of the AutoMLForecast object. There is one result per model, and the best configuration can be found under the config user attribute.

len(auto_mlf.results_)
1
auto_mlf.results_['AutoLightGBM'].best_trial.user_attrs['config']
{'model_params': {'bagging_freq': 1,
  'learning_rate': 0.05,
  'verbosity': -1,
  'n_estimators': 169,
  'lambda_l1': 0.027334069690310565,
  'lambda_l2': 0.0026599310838681858,
  'num_leaves': 112,
  'feature_fraction': 0.7118273996694524,
  'bagging_fraction': 0.8229470565333281,
  'objective': 'l2'},
 'mlf_init_params': {'lags': [48],
  'target_transforms': None,
  'lag_transforms': {1: [ExponentiallyWeightedMean(alpha=0.9)]},
  'date_features': None,
  'num_threads': 1},
 'mlf_fit_params': {'static_features': None}}
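
Each value in results_ exposes optuna's study interface (best_trial is used above), so the individual trials can be inspected with optuna's regular tooling. A minimal sketch, assuming the stored objects are optuna studies:

study = auto_mlf.results_['AutoLightGBM']
# one row per trial: objective value, state and sampled parameters
trials = study.trials_dataframe()
print(trials[['number', 'value', 'state']])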

Individual models

There is one optimization process per model, because different models can make use of different features. Once the optimization process for a model is done, the best configuration is used to retrain it on all of the data. These final models are MLForecast objects and are stored in the models_ attribute.

auto_mlf.models_
{'AutoLightGBM': MLForecast(models=[AutoLightGBM], freq=1, lag_features=['lag48', 'exponentially_weighted_mean_lag1_alpha0.9'], date_features=[], num_threads=1)}
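
Since each entry in models_ is a fitted MLForecast object, it can be used on its own, for example to produce forecasts for a single model. A minimal sketch:

# grab the retrained MLForecast object and forecast with it directly
lgb_fcst = auto_mlf.models_['AutoLightGBM']
lgb_fcst.predict(h=horizon).head()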

Saving

You can use the AutoMLForecast.save method to save the best models found. This produces one directory per model.

with tempfile.TemporaryDirectory() as tmpdir:
    auto_mlf.save(tmpdir)
    print(os.listdir(tmpdir))
['AutoLightGBM']

Since each model is an MLForecast object, you can load it on its own.

with tempfile.TemporaryDirectory() as tmpdir:
    auto_mlf.save(tmpdir)
    loaded = MLForecast.load(f'{tmpdir}/AutoLightGBM')
    print(loaded)
MLForecast(models=[AutoLightGBM], freq=1, lag_features=['lag48', 'exponentially_weighted_mean_lag1_alpha0.9'], date_features=[], num_threads=1)