# 目的:预测不同商店销售额
# 主要步骤:数据处理,特征工程,模型训练,模型评估,模型选择,预测,不足改善
#!pip install wbdata -q
import time  # wall-clock timing of model training
import warnings  # used to silence ignorable warnings
from datetime import datetime, date

import holidays
import lightgbm as lgb  # LightGBM gradient-boosting library
import matplotlib.pyplot as plt  # plotting of actual vs. predicted values
import numpy as np
import pandas as pd
import wbdata
import xgboost as xgb  # XGBoost gradient-boosted trees
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score  # regression metrics
from sklearn.model_selection import train_test_split  # train/validation split
warnings.filterwarnings('ignore') #filterwarnings()方法是用于设置警告过滤器的方法,它可以控制警告信息的输出方式和级别.
# 1. Data preparation
# 1.1 Load the raw tables
# Training set: date, store_nbr, family, sales, onpromotion (the id column is discarded).
train_df = pd.read_csv("/kaggle/input/store-sales-time-series-forecasting/train.csv")
train_df = train_df.drop(columns='id')
# Store metadata keyed by store_nbr: city, state, type, cluster.
stores_df = pd.read_csv("/kaggle/input/store-sales-time-series-forecasting/stores.csv")
# Actual holiday/event calendar shipped with the data set.
holidays_events_df = pd.read_csv("/kaggle/input/store-sales-time-series-forecasting/holidays_events.csv")

# Quick sanity check: row count and column names of each table.
for _label, _frame in (("train_df:", train_df), ("stores_df:", stores_df)):
    print(_label, len(_frame), set(_frame.columns))

# Notebook-style previews; these bare expressions display nothing when run as a script.
train_df.head()
holidays_events_df.query('date=="2012-10-09"| date== "2012-10-12"')
# 1.2特征工程
def f_season(m):
    """Map a month number to a season code.

    Returns 0 for months 12, 1, 2 (winter), 1 for months 3-5 (spring),
    2 for months 6-8 (summer) and 3 for every other input (fall).
    """
    season_table = (
        ((12, 1, 2), 0),
        ((3, 4, 5), 1),
        ((6, 7, 8), 2),
    )
    for months, code in season_table:
        if m in months:
            return code
    # Anything not listed above falls through to the last season.
    return 3
def _fetch_wb_indicator(indicator, column):
    """Download one World Bank indicator series for Ecuador.

    Args:
        indicator: World Bank indicator code passed to ``wbdata.get_data``.
        column: Name to give the indicator's ``value`` column.

    Returns:
        A two-column DataFrame: ``date`` (cast to ``str``; presumably the
        year label of each yearly observation) and ``column``.
    """
    records = wbdata.get_data(indicator, country='EC')
    frame = pd.json_normalize(records).rename(columns={'value': column})
    frame = frame[['date', column]]
    # Same key dtype as the year strings built in dataHandle.
    frame['date'] = frame['date'].astype(str)
    # One row per key keeps the later merge many-to-one (no row duplication).
    return frame.drop_duplicates(subset='date')


def dataHandle(df):
    """Build the model features for a raw sales frame.

    Adds calendar features derived from ``date``, an Ecuadorian public-holiday
    flag, yearly inflation and unemployment from the World Bank, and the store
    attributes from the module-level ``stores_df``. The ``date`` column itself
    is dropped from the result.

    Args:
        df: Frame with at least ``date`` and ``store_nbr`` columns. It is
            modified in place (``date`` is converted and feature columns are
            added) before the merges create the returned frame.

    Returns:
        A new DataFrame with the same rows, in the same order, plus the
        feature columns. Rows whose year has no World Bank record are kept
        with NaN in ``inflation`` / ``unemployment`` instead of being dropped.
    """
    # 1.2.1 Calendar features derived from the date column
    df['date'] = pd.to_datetime(df['date'])
    dates = df['date'].dt
    weekday = dates.weekday  # Monday=0 ... Sunday=6
    df['saturday'] = weekday.eq(5).astype("int8")
    df['sunday'] = weekday.eq(6).astype("int8")
    df['day_of_week'] = (weekday + 1).astype("int8")  # 1=Monday ... 7=Sunday
    df['day_of_month'] = dates.day.astype("int8")
    df['week_of_month'] = ((dates.day - 1) // 7 + 1).astype("int8")
    # `.dt.weekofyear` was removed in pandas 2.0; isocalendar().week replaces it.
    df['week_of_year'] = dates.isocalendar().week.astype("int8")
    df['month'] = dates.month.astype("int8")
    df['season'] = df["month"].map(f_season)  # 0: Winter - 1: Spring - 2: Summer - 3: Fall
    df['year'] = dates.year.astype("int32")
    # Cyclical encodings so that December/January and day 31/day 1 are neighbours.
    df['month_sin'] = np.sin(2*np.pi*df.month/12)
    df['month_cos'] = np.cos(2*np.pi*df.month/12)
    df['day_sin'] = np.sin(2*np.pi*df.day_of_month/31)
    df['day_cos'] = np.cos(2*np.pi*df.day_of_month/31)

    # 1.2.2 Public-holiday flag
    ec_holidays = holidays.Ecuador()
    # Look each distinct date up once instead of once per row.
    holiday_flag = {
        day: int(day in ec_holidays) for day in df['date'].drop_duplicates()
    }
    df['is_holiday'] = df['date'].map(holiday_flag)

    # 1.2.3 Yearly economic indicators for Ecuador (World Bank)
    infl_data = _fetch_wb_indicator("FP.CPI.TOTL.ZG", 'inflation')  # consumer-price inflation
    unem_data = _fetch_wb_indicator("SL.UEM.TOTL.ZS", 'unemployment')  # unemployment rate

    # The indicators are yearly, so join on the year rendered as a string.
    df['date'] = df['date'].dt.year.astype(str)
    # Left joins keep every input row and its order; an inner join would
    # silently drop rows for years the World Bank has no record for.
    df = pd.merge(df, infl_data, how='left', on='date')
    df = pd.merge(df, unem_data, how='left', on='date')
    df.drop(['date'], axis=1, inplace=True)

    # 1.2.4 Attach store attributes: city, state, type, cluster
    df = pd.merge(df, stores_df, how='left', on=['store_nbr'])
    return df
train_df = dataHandle(train_df)

# 1.2.5 Categorical encoding
categorical_cols = ['family', 'store_nbr', 'city', 'state', 'type', 'cluster']
# `columns=` forces one-hot encoding of every listed column and removes the
# originals. Without it get_dummies skips integer columns, so store_nbr and
# cluster were passed through unchanged and then dropped, losing both features.
train_df = pd.get_dummies(train_df, columns=categorical_cols)

# 1.3 Training / validation data
X = train_df.drop('sales', axis=1)
y = train_df['sales']
# NOTE(review): this is a random split on time-ordered data, so the validation
# score can be optimistic; consider holding out the most recent dates instead.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 2. Model selection
model = xgb.XGBRegressor(
    n_estimators=300,      # number of boosted trees
    learning_rate=0.1,     # learning rate
    max_depth=23,          # maximum tree depth
    subsample=0.8,         # fraction of training rows sampled per tree
    colsample_bytree=0.8,  # fraction of features sampled per tree
    reg_alpha=0.01,        # L1 regularisation strength
    reg_lambda=1,          # L2 regularisation strength
    random_state=42,       # random seed
)

# 3. Model fitting and evaluation
# Explicit timer instead of the IPython `%time` magic, which is not valid in a script.
fit_started = time.perf_counter()
model.fit(X_train, y_train)
print(f"Training time: {time.perf_counter() - fit_started:.1f}s")
y_pred = model.predict(X_test)

# Evaluation metrics on the held-out split
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse},R-squared Score: {r2}")

# Visual comparison of actual and predicted values
plt.figure(figsize=(10, 6))  # canvas size
plt.plot(np.arange(len(y_test)), y_test, label='Actual Trend')
plt.plot(np.arange(len(y_test)), y_pred, label='Predicted Trend')
plt.xlabel('Data Index')
plt.ylabel('Trend')
plt.legend()
plt.show()

# 4. Prediction
# Apply the same feature pipeline to the test set.
test_df = pd.read_csv("/kaggle/input/store-sales-time-series-forecasting/test.csv").drop('id',axis=1)
test_df = dataHandle(test_df)
test_df = pd.get_dummies(test_df, columns=categorical_cols)
# Same columns in the same order as the training features; a category that
# does not occur in the test set becomes an all-zero dummy column.
test_df = test_df.reindex(columns=X.columns, fill_value=0)
print(train_df.shape)
print(test_df.shape)

# Predict; sales cannot be negative, so clip the regressor output at zero.
test_predictions = np.clip(model.predict(test_df), 0, None)
sub=pd.read_csv("/kaggle/input/store-sales-time-series-forecasting/sample_submission.csv")
# Positional assignment: relies on dataHandle keeping the row count and order of test.csv.
sub['sales']=test_predictions
sub.to_csv("submission.csv",index=False)