
mlforecast_objective

```
mlforecast_objective(
    df: Union[pandas.core.frame.DataFrame, polars.dataframe.frame.DataFrame],
    config_fn: Callable[[optuna.trial._trial.Trial], Dict[str, Any]],
    loss: Callable,
    model: sklearn.base.BaseEstimator,
    freq: Union[int, str],
    n_windows: int,
    h: int,
    step_size: Optional[int] = None,
    input_size: Optional[int] = None,
    refit: Union[bool, int] = False,
    id_col: str = 'unique_id',
    time_col: str = 'ds',
    target_col: str = 'y',
)
```

Optuna objective function for the MLForecast class.
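The returned callable takes an optuna trial, draws a configuration through `config_fn`, evaluates `model` with MLForecast cross-validation over `n_windows` windows of horizon `h`, and returns the float computed by `loss`. The sampled configuration is stored in the trial's user attributes, so it can be retrieved from the best trial afterwards (as done in the example below).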

|  | Type | Default | Details |
|---|---|---|---|
| df | Union |  |  |
| config_fn | Callable |  | Function that takes an optuna trial and produces a configuration with the following keys:<br>- model_params<br>- mlf_init_params<br>- mlf_fit_params |
| loss | Callable |  | Function that takes the validation and train dataframes and produces a float. |
| model | BaseEstimator |  | scikit-learn compatible model to be trained. |
| freq | Union |  | pandas or polars offset alias or integer denoting the frequency of the series. |
| n_windows | int |  | Number of windows to evaluate. |
| h | int |  | Forecast horizon. |
| step_size | Optional | None | Step size between each cross-validation window. If None, it will equal h. |
| input_size | Optional | None | Maximum number of training samples per series in each window. If None, an expanding window is used. |
| refit | Union | False | Retrain the model for each cross-validation window. If False, the model is trained once at the start and then used to predict each window. If a positive integer, the model is retrained every refit windows. |
| id_col | str | unique_id | Column that identifies each series. |
| time_col | str | ds | Column that identifies each timestep; its values can be timestamps or integers. |
| target_col | str | y | Column that contains the target values. |
| **Returns** | Callable |  |  |
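The snippet below is a minimal sketch of the `config_fn` contract described above; the function name, the single hyperparameter, and the lag choice are illustrative placeholders, not recommendations. The mapping of the three keys matches how the best configuration is used at the end of this page: `model_params` go to the model's constructor, `mlf_init_params` to `MLForecast(...)`, and `mlf_fit_params` to `MLForecast.fit(...)`.

```python
def minimal_config_fn(trial):  # hypothetical name, for illustration only
    return {
        # forwarded to the model's constructor
        'model_params': {'n_estimators': trial.suggest_int('n_estimators', 10, 100)},
        # forwarded to the MLForecast constructor
        'mlf_init_params': {'lags': [1]},
        # forwarded to MLForecast.fit
        'mlf_fit_params': {},
    }
```

A complete example on the M4 Weekly dataset follows.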
```python
import lightgbm as lgb
import optuna
from datasetsforecast.m4 import M4, M4Evaluation, M4Info
from utilsforecast.losses import smape

from mlforecast import MLForecast
from mlforecast.lag_transforms import ExpandingMean, RollingMean
from mlforecast.optimization import mlforecast_objective
from mlforecast.target_transforms import Differences, LocalBoxCox, LocalStandardScaler
```
```python
def train_valid_split(group):
    df, *_ = M4.load(directory='data', group=group)
    df['ds'] = df['ds'].astype('int')
    horizon = M4Info[group].horizon
    # hold out the last `horizon` observations of each series for validation
    valid = df.groupby('unique_id').tail(horizon)
    train = df.drop(valid.index)
    return train, valid

h = M4Info['Weekly'].horizon
weekly_train, weekly_valid = train_valid_split('Weekly')
# LightGBM can use pandas categorical columns directly, so the series id is cast
# to category and later passed as a static feature
weekly_train['unique_id'] = weekly_train['unique_id'].astype('category')
weekly_valid['unique_id'] = weekly_valid['unique_id'].astype(weekly_train['unique_id'].dtype)
```
```python
def config_fn(trial):
    # candidate lag sets to search over
    candidate_lags = [
        [1],
        [13],
        [1, 13],
        range(1, 33),
    ]
    lag_idx = trial.suggest_categorical('lag_idx', range(len(candidate_lags)))
    # candidate lag transforms, keyed by lag
    candidate_lag_tfms = [
        {
            1: [RollingMean(window_size=13)],
        },
        {
            1: [RollingMean(window_size=13)],
            13: [RollingMean(window_size=13)],
        },
        {
            13: [RollingMean(window_size=13)],
        },
        {
            4: [ExpandingMean(), RollingMean(window_size=4)],
            8: [ExpandingMean(), RollingMean(window_size=4)],
        },
    ]
    lag_tfms_idx = trial.suggest_categorical('lag_tfms_idx', range(len(candidate_lag_tfms)))
    # candidate target transforms (applied to the series before computing features)
    candidate_targ_tfms = [
        [Differences([1])],
        [LocalBoxCox()],
        [LocalStandardScaler()],
        [LocalBoxCox(), Differences([1])],
        [LocalBoxCox(), LocalStandardScaler()],
        [LocalBoxCox(), Differences([1]), LocalStandardScaler()],
    ]
    targ_tfms_idx = trial.suggest_categorical('targ_tfms_idx', range(len(candidate_targ_tfms)))
    return {
        'model_params': {
            'learning_rate': 0.05,
            'objective': 'l1',
            'bagging_freq': 1,
            'num_threads': 2,
            'verbose': -1,
            'force_col_wise': True,
            'n_estimators': trial.suggest_int('n_estimators', 10, 1000, log=True),
            'num_leaves': trial.suggest_int('num_leaves', 31, 1024, log=True),
            'lambda_l1': trial.suggest_float('lambda_l1', 0.01, 10, log=True),
            'lambda_l2': trial.suggest_float('lambda_l2', 0.01, 10, log=True),
            'bagging_fraction': trial.suggest_float('bagging_fraction', 0.75, 1.0),
            'feature_fraction': trial.suggest_float('feature_fraction', 0.75, 1.0),
        },
        'mlf_init_params': {
            'lags': candidate_lags[lag_idx],
            'lag_transforms': candidate_lag_tfms[lag_tfms_idx],
            'target_transforms': candidate_targ_tfms[targ_tfms_idx],
        },
        'mlf_fit_params': {
            'static_features': ['unique_id'],
        },
    }
```
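Note that the lag, lag-transform, and target-transform candidates are selected through integer indices rather than passed to `suggest_categorical` directly: optuna only accepts None, bool, int, float, and str as categorical choices, so the trial records an index and `config_fn` looks up the corresponding objects.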

```python
def loss(df, train_df):
    # the objective stores the cross-validation predictions under the 'model' column;
    # train_df is available for losses that need the training data, but SMAPE doesn't use it
    return smape(df, models=['model'])['model'].mean()
```
```python
optuna.logging.set_verbosity(optuna.logging.WARNING)
objective = mlforecast_objective(
    df=weekly_train,
    config_fn=config_fn,
    loss=loss,
    model=lgb.LGBMRegressor(),
    freq=1,  # the timestamps are integers, so the frequency is an integer step
    n_windows=2,
    h=h,
)
study = optuna.create_study(
    direction='minimize', sampler=optuna.samplers.TPESampler(seed=0)
)
study.optimize(objective, n_trials=2)
```
```python
# the full configuration of each trial is stored in its user attributes
best_cfg = study.best_trial.user_attrs['config']
final_model = MLForecast(
    models=[lgb.LGBMRegressor(**best_cfg['model_params'])],
    freq=1,
    **best_cfg['mlf_init_params'],
)
final_model.fit(weekly_train, **best_cfg['mlf_fit_params'])
preds = final_model.predict(h)
# reshape into (n_series, horizon); 13 is the Weekly horizon (equal to h)
M4Evaluation.evaluate('data', 'Weekly', preds['LGBMRegressor'].values.reshape(-1, 13))
```
|  | SMAPE | MASE | OWA |
|---|---|---|---|
| Weekly | 9.261538 | 2.614473 | 0.976158 |
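Since OWA is measured relative to the M4 Naive2 benchmark, a value below 1 means the tuned model beats that benchmark on average.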