
import numpy as np
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.metrics import mean_squared_error

# Example 1: model evaluation WITH data leakage.
# The imputer and the scaler are fitted on the whole training set before
# cross-validation, so every validation fold has already influenced the
# preprocessing statistics.
#
# NOTE: load_boston was removed in scikit-learn 1.2; this example needs an
# older scikit-learn release to run as written.

# Importing the dataset (loaded once and reused for data, names and target)
boston = load_boston()
data = pd.DataFrame(boston['data'], columns=boston['feature_names'])
data['target'] = boston['target']

# Split the input and target features
X = data.iloc[:, :-1].copy()
y = data.iloc[:, -1].copy()

# Adding 100 random missing values (positions may repeat, so the number of
# distinct NaN cells can be slightly below 100)
np.random.seed(11)
rand_cols = np.random.randint(0, X.shape[1], 100)
rand_rows = np.random.randint(0, X.shape[0], 100)
for i, j in zip(rand_rows, rand_cols):
    X.iloc[i, j] = np.nan

# Splitting the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=11)

# Initializing KNN Regressor
knn = KNeighborsRegressor()
# Initializing mode imputer
imp = SimpleImputer(strategy='most_frequent')
# Initializing StandardScaler
standard_scaler = StandardScaler()

# Imputing and scaling X_train (this is the leak: statistics come from all of
# X_train, including rows that later serve as validation folds)
X_train_impute = imp.fit_transform(X_train)
X_train_scaled = standard_scaler.fit_transform(X_train_impute)

# Running 5-fold cross-validation
cv = cross_validate(
    estimator=knn,
    X=X_train_scaled,
    y=y_train,
    cv=5,
    scoring="neg_root_mean_squared_error",
    return_train_score=True,
)
# Scores are negated RMSE values, hence the multiplication by -1.
# Calculating mean of the training scores of cross-validation
print(f'Training RMSE (with data leakage): {-1 * np.mean(cv["train_score"])}')
# Calculating mean of the validation scores of cross-validation
print(f'validation RMSE (with data leakage): {-1 * np.mean(cv["test_score"])}')

# Fitting the model to the training data
# (the original referenced an undefined name `lr`; the regressor is `knn`)
knn.fit(X_train_scaled, y_train)

# Preprocessing the test data with the statistics learned on X_train
X_test_impute = imp.transform(X_test)
X_test_scaled = standard_scaler.transform(X_test_impute)

# Predictions and model evaluation on unseen data
pred = knn.predict(X_test_scaled)
print(f'RMSE on unseen data: {np.sqrt(mean_squared_error(y_test, pred))}')

在上面的代码中,`X_train` 是训练集(用于k折交叉验证),`X_test` 用于在未见过的数据上评估模型。上面的代码是一个带有数据泄漏的模型评估示例:用于填补缺失值的众数(`strategy='most_frequent'`)是在整个 `X_train` 上计算的。类似地,用于缩放数据的均值和标准差也是在整个 `X_train` 上计算的。也就是说,`X_train` 的缺失值填补和缩放都是在k折交叉验证之前完成的。
在k折交叉验证中,`X_train` 被分割成 k 个折。在每次交叉验证迭代中,其中一个折用于验证(我们称其为验证部分),其余的折用于训练(我们称其为训练部分)。每次迭代中的训练部分和验证部分,其缺失值都已经用在整个 `X_train` 上计算的众数填补过。类似地,它们也已经用在整个 `X_train` 上计算的均值和标准差进行了缩放。这种插补和缩放操作会导致来自整个 `X_train` 的信息(其中包含验证部分的信息)泄漏到k折交叉验证的训练部分和验证部分中。这种信息泄漏可能导致模型在验证部分上的性能估计有偏差。下面的代码展示了一种通过使用管道(Pipeline)来避免它的方法。
# Example 2: model evaluation WITHOUT data leakage.
# Imputer, scaler and regressor are bundled into one estimator so that the
# preprocessing is fitted as part of each cross-validation fit rather than
# once on all of X_train beforehand.

# Preprocessing and regressor pipeline
pipeline = Pipeline(
    steps=[
        ('imputer', imp),
        ('scaler', standard_scaler),
        ('regressor', knn),
    ]
)

# Running 5-fold cross-validation using pipeline as estimator; note that the
# raw (un-imputed, un-scaled) X_train is passed in
cv = cross_validate(
    estimator=pipeline,
    X=X_train,
    y=y_train,
    cv=5,
    scoring="neg_root_mean_squared_error",
    return_train_score=True,
)
# Scores are negated RMSE values, hence the multiplication by -1.
# Calculating mean of the training scores of cross-validation
print(f'Training RMSE (without data leakage): {-1 * np.mean(cv["train_score"])}')
# Calculating mean of the validation scores of cross-validation
print(f'validation RMSE (without data leakage): {-1 * np.mean(cv["test_score"])}')

# Fitting the pipeline to the training data
pipeline.fit(X_train, y_train)

# Predictions and model evaluation on unseen data (the pipeline applies the
# fitted imputer and scaler to X_test before predicting)
pred = pipeline.predict(X_test)
print(f'RMSE on unseen data: {np.sqrt(mean_squared_error(y_test, pred))}')
在上面的代码中,我们把插补器、缩放器和回归器都放进了管道中。在本例中,`X_train` 被分割为5个折,在每次迭代中,管道只使用训练部分来计算用于填补训练部分和验证部分缺失值的众数。同样,用于缩放训练部分和验证部分的均值和标准差也只在训练部分上计算。这一过程消除了数据泄漏,因为在每次k折交叉验证迭代中,插补所用的众数以及缩放所用的均值和标准差都只在训练部分上计算。在每次k折交叉验证迭代中,这些值被用来对训练部分和验证部分进行插补和缩放。