使用逻辑回归完成社交网络广告推荐的预测
内容概要
1 导入数据,了解数据基本信息
2 探索标签和特征分布状况
3 数据预处理
4 初步建模,建立benchmark
5 模型调优
5.1 数据量纲统一
5.2 使用交叉验证选取最优参数
5.3 改变样本权重
6 得到最终模型
1 导入数据,了解数据基本信息
# 导入所需模块和包
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression as LR
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.metrics import roc_auc_score,roc_curve,auc
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
plt.style.use('ggplot') # 更改绘图风格,R语言绘图库的风格
plt.rcParams['font.sans-serif']='SimHei' ##设置中文显示
# Load the raw dataset from the local data/ directory and preview the first rows
df = pd.read_csv("data/Social_Network_Ads.csv")
df.head()
User ID | Gender | Age | EstimatedSalary | Purchased | |
---|---|---|---|---|---|
0 | 15624510 | Male | 19 | 19000 | 0 |
1 | 15810944 | Male | 35 | 20000 | 0 |
2 | 15668575 | Female | 26 | 43000 | 0 |
3 | 15603246 | Female | 27 | 57000 | 0 |
4 | 15804002 | Male | 19 | 76000 | 0 |
# 查看数据基本信息
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 User ID 400 non-null int64
1 Gender 400 non-null object
2 Age 400 non-null int64
3 EstimatedSalary 400 non-null int64
4 Purchased 400 non-null int64
dtypes: int64(4), object(1)
memory usage: 15.8+ KB
df.duplicated().sum() # Count fully duplicated rows (this checks duplicates, not missing values)
0
len(np.unique(df["User ID"])) == df.shape[0] # 查看用户ID是否有重复值
True
2 探索标签和特征分布状况
df.describe()
User ID | Age | EstimatedSalary | Purchased | |
---|---|---|---|---|
count | 4.000000e+02 | 400.000000 | 400.000000 | 400.000000 |
mean | 1.569154e+07 | 37.655000 | 69742.500000 | 0.357500 |
std | 7.165832e+04 | 10.482877 | 34096.960282 | 0.479864 |
min | 1.556669e+07 | 18.000000 | 15000.000000 | 0.000000 |
25% | 1.562676e+07 | 29.750000 | 43000.000000 | 0.000000 |
50% | 1.569434e+07 | 37.000000 | 70000.000000 | 0.000000 |
75% | 1.575036e+07 | 46.000000 | 88000.000000 | 1.000000 |
max | 1.581524e+07 | 60.000000 | 150000.000000 | 1.000000 |
df["Purchased"].value_counts()
0 257
1 143
Name: Purchased, dtype: int64
df["Purchased"].value_counts().plot(kind='pie',autopct='%.2f%%');
# 查看性别分布状况
df["Gender"].value_counts().plot(kind="bar");
# Purchase intent broken down by gender: count rows per (Gender, Purchased)
# pair, then move Purchased into the columns so each gender becomes one
# stacked bar.
pair_counts = df.groupby(["Gender", "Purchased"]).size()
data = pair_counts.unstack("Purchased")
data.plot(kind="bar", stacked=True);
# Side-by-side histograms of Age and EstimatedSalary, each overlaid with the
# purchase label so the relation between the feature and the label is visible.
plt.figure(figsize=(12,4))
plt.subplot(121)
df["Age"].plot(kind="hist") # Histogram of the age distribution
# Purchased (0/1) is multiplied by 100 so the points land at y=0 or y=100 on the
# histogram's count axis instead of being squashed at the bottom.
plt.scatter(df["Age"],df["Purchased"]*100,color="red",s=10,alpha=0.2);
plt.title("年龄分布状况")
plt.subplot(122)
df["EstimatedSalary"].plot(kind="hist") # Histogram of the estimated-salary distribution
# Same 0/100 overlay of the purchase label as in the left panel.
plt.scatter(df["EstimatedSalary"],df["Purchased"]*100,color="red",s=10,alpha=0.2);
plt.title("收入分布状况")
Text(0.5, 1.0, '收入分布状况')
3 数据预处理
# Encode the text column as integers: Male -> 0, Female -> 1.
# NOTE(review): any value other than these two would presumably become NaN
# after map(); the dataset preview only shows Male/Female — confirm.
df["Gender"] = df["Gender"].map({"Male":0,"Female":1})
df.head()
User ID | Gender | Age | EstimatedSalary | Purchased | |
---|---|---|---|---|---|
0 | 15624510 | 0 | 19 | 19000 | 0 |
1 | 15810944 | 0 | 35 | 20000 | 0 |
2 | 15668575 | 1 | 26 | 43000 | 0 |
3 | 15603246 | 1 | 27 | 57000 | 0 |
4 | 15804002 | 0 | 19 | 76000 | 0 |
4 初步建模,建立benchmark
# Build the feature matrix and the label vector.
# Columns 1..-2 (Gender, Age, EstimatedSalary) are the features; the first
# column (User ID) is dropped and the last column (Purchased) is the label.
X = df.iloc[:,1:-1]
y = df.iloc[:,-1]
X.head()
Gender | Age | EstimatedSalary | |
---|---|---|---|
0 | 0 | 19 | 19000 |
1 | 0 | 35 | 20000 |
2 | 1 | 26 | 43000 |
3 | 1 | 27 | 57000 |
4 | 0 | 19 | 76000 |
# Split into training and test sets (70% / 30%); the fixed seed keeps the split
# reproducible.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, y, test_size=0.3, random_state=12)
# Renumber every part from 0. The later column-wise concat with the scaled
# features aligns rows on the index, so all parts need a clean 0..n-1 index.
# Reassignment is used instead of reset_index(inplace=True) in a loop: it does
# not mutate the objects returned by train_test_split in place.
Xtrain, Xtest, Ytrain, Ytest = (
    part.reset_index(drop=True) for part in (Xtrain, Xtest, Ytrain, Ytest)
)
# Benchmark: logistic regression with default settings on the unscaled features.
clf = LR()
clf = clf.fit(Xtrain, Ytrain)
print("训练集上的预测准确率为:",clf.score(Xtrain,Ytrain))
print("测试集上的预测准确率为:",clf.score(Xtest,Ytest))
训练集上的预测准确率为: 0.6571428571428571
测试集上的预测准确率为: 0.6083333333333333
confusion_matrix(Ytrain,clf.predict(Xtrain))
array([[184, 0],
[ 96, 0]], dtype=int64)
area = roc_auc_score(Ytrain,clf.predict_proba(Xtrain)[:,1])
area
0.3393625452898551
print(classification_report(Ytrain,clf.predict(Xtrain)))
precision recall f1-score support
0 0.66 1.00 0.79 184
1 0.00 0.00 0.00 96
accuracy 0.66 280
macro avg 0.33 0.50 0.40 280
weighted avg 0.43 0.66 0.52 280
5 模型调优
5.1 数据量纲统一
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
# Min-max scale the training set. Only the columns after the first one
# (Age, EstimatedSalary) are scaled; the first column (Gender) is already 0/1.
mms = MinMaxScaler()
# The scaler is fitted on training data only.
mms = mms.fit(Xtrain.iloc[:,1:])
# Wrap the transformed values in a DataFrame carrying the original column names.
Xtrain_mms = pd.DataFrame(mms.transform(Xtrain.iloc[:,1:]),columns=Xtrain.columns[1:])
# Re-attach the unscaled Gender column. concat(axis=1) aligns rows on the index,
# which works because Xtrain's index was reset to 0..n-1 after the split.
Xtrain = pd.concat([Xtrain.iloc[:,0],Xtrain_mms],axis=1)
Xtrain.head()
Gender | Age | EstimatedSalary | |
---|---|---|---|
0 | 1 | 0.333333 | 1.000000 |
1 | 1 | 0.190476 | 0.148148 |
2 | 0 | 0.452381 | 0.274074 |
3 | 1 | 0.238095 | 0.511111 |
4 | 0 | 0.500000 | 0.674074 |
# Scale the test set with the scaler already fitted on the training set
# (transform only, no refit), so no test-set statistics are used.
Xtest_mms = pd.DataFrame(mms.transform(Xtest.iloc[:,1:]),columns=Xtest.columns[1:])
# Re-attach the unscaled Gender column; rows align on the reset 0..n-1 index.
Xtest = pd.concat([Xtest.iloc[:,0],Xtest_mms],axis=1)
Xtest.head()
Gender | Age | EstimatedSalary | |
---|---|---|---|
0 | 1 | 0.976190 | 0.540741 |
1 | 0 | 0.500000 | 0.414815 |
2 | 1 | 0.738095 | 0.933333 |
3 | 0 | 0.404762 | 0.037037 |
4 | 0 | 0.404762 | 0.088889 |
# Refit logistic regression with default settings, now on the min-max scaled
# features, to measure the effect of scaling against the benchmark.
clf1 = LR()
clf1 = clf1.fit(Xtrain,Ytrain)
print("训练集上的预测准确率为:",clf1.score(Xtrain,Ytrain))
print("测试集上的预测准确率为:",clf1.score(Xtest,Ytest))
训练集上的预测准确率为: 0.8321428571428572
测试集上的预测准确率为: 0.8166666666666667
confusion_matrix(Ytrain,clf1.predict(Xtrain))
array([[177, 7],
[ 40, 56]], dtype=int64)
# NOTE(review): roc_auc_score receives hard 0/1 labels from predict() here, so
# the value is the mean of the two class recalls (single-threshold ROC), not
# the probability-based AUC. Pass clf1.predict_proba(Xtrain)[:,1] for the
# latter, as the benchmark cell does.
roc_auc_score(Ytrain,clf1.predict(Xtrain))
0.7726449275362319
print(classification_report(Ytrain,clf1.predict(Xtrain)))
precision recall f1-score support
0 0.82 0.96 0.88 184
1 0.89 0.58 0.70 96
accuracy 0.83 280
macro avg 0.85 0.77 0.79 280
weighted avg 0.84 0.83 0.82 280
5.2 使用交叉验证选取最优参数
# Pick the regularisation parameter C from a curve of cross-validated accuracy.
# For each candidate C, a fresh model is scored with 5-fold cross-validation on
# the training set and the mean fold accuracy is recorded.
Crange = np.linspace(0.001, 15, 100)
# A throwaway estimator is built per candidate so the already-fitted global
# `clf` is not overwritten by an unfitted model.
s_mean = [
    cross_val_score(LR(C=c_value), Xtrain, Ytrain, cv=5).mean()
    for c_value in Crange
]
# argmax returns the first position of the maximum, i.e. the smallest C among ties.
best_idx = int(np.argmax(s_mean))
print(f"当C={Crange[best_idx]}时,准确率最高{s_mean[best_idx]}")
plt.plot(Crange, s_mean, c="r");
当C=9.394313131313131时,准确率最高0.8714285714285713
# Refit with the selected parameter; C=9.4 is the rounded value of the best C
# reported by the cross-validation search (about 9.394).
clf2 = LR(C=9.4)
clf2 = clf2.fit(Xtrain,Ytrain)
print("训练集上的预测准确率为:",clf2.score(Xtrain,Ytrain))
print("测试集上的预测准确率为:",clf2.score(Xtest,Ytest))
训练集上的预测准确率为: 0.8714285714285714
测试集上的预测准确率为: 0.825
confusion_matrix(Ytrain,clf2.predict(Xtrain))
array([[175, 9],
[ 27, 69]], dtype=int64)
# NOTE(review): computed from hard predict() labels, so this is the mean of the
# two class recalls rather than a probability-based AUC. Use
# clf2.predict_proba(Xtrain)[:,1] for the latter.
roc_auc_score(Ytrain,clf2.predict(Xtrain))
0.8349184782608696
print(classification_report(Ytrain,clf2.predict(Xtrain)))
precision recall f1-score support
0 0.87 0.95 0.91 184
1 0.88 0.72 0.79 96
accuracy 0.87 280
macro avg 0.88 0.83 0.85 280
weighted avg 0.87 0.87 0.87 280
5.3 改变样本权重
# Same C as before, but with class_weight="balanced" to compensate for the
# imbalance between non-buyers and buyers in the training labels.
clf3 = LR(C=9.4,class_weight="balanced")
clf3 = clf3.fit(Xtrain,Ytrain)
print("训练集上的预测准确率为:",clf3.score(Xtrain,Ytrain))
print("测试集上的预测准确率为:",clf3.score(Xtest,Ytest))
训练集上的预测准确率为: 0.8678571428571429
测试集上的预测准确率为: 0.8083333333333333
confusion_matrix(Ytrain,clf3.predict(Xtrain))
array([[160, 24],
[ 13, 83]], dtype=int64)
# NOTE(review): computed from hard predict() labels, so this is the mean of the
# two class recalls rather than a probability-based AUC. Use
# clf3.predict_proba(Xtrain)[:,1] for the latter.
roc_auc_score(Ytrain,clf3.predict(Xtrain))
0.8670742753623188
print(classification_report(Ytrain,clf3.predict(Xtrain)))
precision recall f1-score support
0 0.92 0.87 0.90 184
1 0.78 0.86 0.82 96
accuracy 0.87 280
macro avg 0.85 0.87 0.86 280
weighted avg 0.87 0.87 0.87 280
# If recall on the positive class is to be raised at any cost...
# class_weight={1:10} gives class 1 a weight of 10.
# NOTE(review): class 0 is not listed and is presumably left at the default
# weight of 1 — confirm against the scikit-learn documentation.
clf4 = LR(C=9.4,class_weight={1:10})
clf4 = clf4.fit(Xtrain,Ytrain)
print(classification_report(Ytrain,clf4.predict(Xtrain)))
precision recall f1-score support
0 0.99 0.59 0.74 184
1 0.56 0.99 0.71 96
accuracy 0.73 280
macro avg 0.77 0.79 0.73 280
weighted avg 0.84 0.73 0.73 280
6 得到最终模型
# 导入所需模块和包
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression as LR
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.metrics import roc_auc_score,roc_curve,auc
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
# Final model, rebuilt end to end from the raw file.

# Load the raw data and encode Gender as integers (Male -> 0, Female -> 1).
df = pd.read_csv("data/Social_Network_Ads.csv")
df["Gender"] = df["Gender"].map({"Male":0,"Female":1})

# Features are the columns between the first (User ID) and the last; the last
# column (Purchased) is the label.
X = df.iloc[:, 1:-1]
y = df.iloc[:, -1]

# 70% / 30% split with a fixed seed so the result is reproducible.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, y, test_size=0.3, random_state=12)
# Renumber every part from 0: the concat calls below align rows on the index.
# Reassignment is used instead of reset_index(inplace=True) in a loop, which
# mutated the split results in place.
Xtrain, Xtest, Ytrain, Ytest = (
    part.reset_index(drop=True) for part in (Xtrain, Xtest, Ytrain, Ytest)
)

# Min-max scale every column after the first (Gender is already 0/1). The
# scaler is fitted on the training set only and then reused for the test set.
mms = MinMaxScaler()
scaled_cols = Xtrain.columns[1:]
Xtrain_mms = pd.DataFrame(mms.fit_transform(Xtrain.iloc[:, 1:]), columns=scaled_cols)
Xtrain = pd.concat([Xtrain.iloc[:, 0], Xtrain_mms], axis=1)
Xtest_mms = pd.DataFrame(mms.transform(Xtest.iloc[:, 1:]), columns=scaled_cols)
Xtest = pd.concat([Xtest.iloc[:, 0], Xtest_mms], axis=1)

# Logistic regression with the tuned C and balanced class weights.
clf = LR(C=9.4, class_weight="balanced")
clf = clf.fit(Xtrain, Ytrain)
print("训练集上的预测准确率为:",clf.score(Xtrain,Ytrain))
print("测试集上的预测准确率为:",clf.score(Xtest,Ytest))
训练集上的预测准确率为: 0.8678571428571429
测试集上的预测准确率为: 0.8083333333333333
confusion_matrix(Ytest,clf.predict(Xtest))
array([[55, 18],
[ 5, 42]], dtype=int64)
# NOTE(review): roc_auc_score receives hard 0/1 labels from predict() here, so
# `area` is the mean of the two class recalls on the test set, not the area
# under a probability-based ROC curve. Use clf.predict_proba(Xtest)[:,1] for
# the latter.
area = roc_auc_score(Ytest,clf.predict(Xtest))
area
0.8235208394054211
print(classification_report(Ytest,clf.predict(Xtest)))
precision recall f1-score support
0 0.92 0.75 0.83 73
1 0.70 0.89 0.79 47
accuracy 0.81 120
macro avg 0.81 0.82 0.81 120
weighted avg 0.83 0.81 0.81 120
# ROC points from the predicted probability of the positive class (column 1).
FPR, recall, thresholds = roc_curve(Ytest, clf.predict_proba(Xtest)[:, 1])
# Area under exactly the curve being drawn. The previously computed `area` came
# from hard 0/1 predictions, so it did not match this probability-based curve.
curve_area = auc(FPR, recall)

# Plot the ROC curve against the chance diagonal.
plt.figure()
plt.plot(FPR, recall, color='red',
         label='ROC curve (area = %0.2f)' % curve_area)
plt.plot([0, 1], [0, 1], color='black', linestyle='--')
# A small margin keeps the curve's end points off the axes frame.
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('Recall')
plt.title('ROC curve')
plt.legend(loc="lower right")
plt.show()